In [2]:
import pyprind
import pandas as pd
import os

In [5]:
# 베이스경로 설정 -> 데이터가 있는 폴더
basepath = 'data/aclImdb'


labels = {'pos':1, 'neg':0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test','train'):
    for l in ('pos','neg'):
        path = os.path.join(basepath, s, l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file),
                     'r', encoding='utf-8') as infile:
                txt = infile.read()
            
            df = df.append([[txt, labels[l]]], ignore_index = True)
            pbar.update()

df.columns =['review','sentiment']


  df = df.append([[txt, labels[l]]], ignore_index = True)
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:32


In [6]:
import numpy as np

In [7]:
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index = False, encoding = 'utf-8')

### 데이터 호출 및 확인

In [9]:
df = pd.read_csv('movie_data.csv', encoding ='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [11]:
df.shape

(50000, 2)

## 단어를 특성 벡터로 변환
- 사이킷런에 구현된 CountVectorize를 통해 구현 가능

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two'
])
bag = count.fit_transform(docs)

In [14]:
# 각 단어들이 정수 인덱스와 매핑된 딕셔너리에 저장 
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [15]:
bag.toarray()

array([[0, 1, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 0, 1],
       [2, 3, 2, 1, 1, 1, 2, 1, 1]])

## tf-idf(t,d)

In [16]:
from sklearn.feature_extraction.text import TfidfTransformer

In [19]:
tfidf = TfidfTransformer(use_idf = True,
                        norm='l2',
                        smooth_idf=True)

np.set_printoptions(precision = 2)

print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


## 텍스트 데이터 정제

In [21]:
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [25]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>','',text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                         text)
    text = (re.sub('[\W]+',' ', text.lower())+ 
           " ".join(emoticons).replace('-',''))
    return text

In [26]:
preprocessor(df.loc[0, 'review'][-50:])

'is seven title brazil not available'

In [27]:
preprocessor("</a>This :) is :( a test:-)!")

'this is a test :) :( :)'

In [28]:
df['review']= df.review.apply(preprocessor)

In [29]:
def tokenizer(text):
    return text.split()

tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [30]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [33]:
from nltk.corpus import stopwords

stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] 
 if w not in stop] 

['runner', 'like', 'run', 'run', 'lot']

# 문서 분류를 위한 로지스틱 회귀 모델

In [34]:
X_train = df.loc[:25000,'review'].values
y_train = df.loc[:25000,'sentiment'].values
X_test = df.loc[25000:,'review'].values
y_test = df.loc[25000:,'sentiment'].values

In [36]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
tfidf = TfidfVectorizer(strip_accents=None,
                       lowercase=False,
                       preprocessor = None)
param_grid = [{
    'vect__ngram_range' : [(1,1)],
    'vect__stop_words' : [stop, None],
    'vect__tokenizer' : [tokenizer, tokenizer_porter],
    'clf__penalty' : ['l1','l2'],
    'clf__C' : [1.0, 10.0, 100.0]},
    {'vect__ngram_range' : [(1,1)],
    'vect__stop_words' : [stop, None],
    'vect__tokenizer' : [tokenizer, tokenizer_porter],
    'vect__use_idf' : [False],
    'vect__norm': [None],
   'clf__penalty' : ['l1','l2'],
    'clf__C' : [1.0, 10.0, 100.0]
}]

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0, solver='liblinear'))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           n_jobs=-1)

In [42]:
gs_lr_tfidf.fit(X_train, y_train)









In [43]:
import warnings 
warnings.filterwarnings('ignore')

In [44]:
print('최적의 매개변수 조합: %s ' % gs_lr_tfidf.best_params_)
print('CV 정확도: %.3f' % gs_lr_tfidf.best_score_)

최적의 매개변수 조합: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x7fb8b3da7c70>} 
CV 정확도: 0.897
