# Chapter 6. AI 기초 - NLP
* https://ai.stanford.edu/~amaas/data/sentiment/

In [1]:
import os
import pyprind
import pandas as pd

# Data

In [2]:
basepath = './Data/'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file), 
                      'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], 
                           ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']

  df = df.append([[txt, labels[l]]],
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:43


In [3]:
df.to_csv('movie_data.csv', index= False, encoding= "utf-8")

# BoW(Bag of Word)

In [4]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
CntVec = CountVectorizer()
bag = CntVec.fit_transform(df['review'][:10000])

CntVec

In [6]:
CntVec.vocabulary_

{'went': 47831,
 'and': 2096,
 'saw': 37999,
 'this': 43869,
 'movie': 29018,
 'last': 25015,
 'night': 29932,
 'after': 1336,
 'being': 4366,
 'coaxed': 8587,
 'to': 44240,
 'by': 6521,
 'few': 16275,
 'friends': 17458,
 'of': 30611,
 'mine': 28190,
 'll': 25829,
 'admit': 1131,
 'that': 43712,
 'was': 47558,
 'reluctant': 36027,
 'see': 38577,
 'it': 23163,
 'because': 4220,
 'from': 17512,
 'what': 47895,
 'knew': 24453,
 'ashton': 2935,
 'kutcher': 24688,
 'he': 20128,
 'only': 30804,
 'able': 708,
 'do': 12897,
 'comedy': 8874,
 'wrong': 48660,
 'played': 33059,
 'the': 43721,
 'character': 7590,
 'jake': 23294,
 'fischer': 16541,
 'very': 46860,
 'well': 47805,
 'kevin': 24199,
 'costner': 9969,
 'ben': 4464,
 'randall': 35173,
 'with': 48332,
 'such': 42293,
 'professionalism': 34136,
 'sign': 39585,
 'good': 18744,
 'is': 23105,
 'can': 6745,
 'toy': 44569,
 'our': 31129,
 'emotions': 14391,
 'one': 30788,
 'did': 12209,
 'exactly': 15232,
 'entire': 14730,
 'theater': 43727,
 

In [7]:
sentense = ['The apple is delicious',
            'The banana is sweet', 
            'One aplle and two banana please.']

CntVec = CountVectorizer()
bag = CntVec.fit_transform(sentense)

CntVec

In [8]:
CntVec.vocabulary_

{'the': 9,
 'apple': 2,
 'is': 5,
 'delicious': 4,
 'banana': 3,
 'sweet': 8,
 'one': 6,
 'aplle': 1,
 'and': 0,
 'two': 10,
 'please': 7}

In [9]:
tmp = bag.toarray()
tmp

array([[0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0],
       [1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1]], dtype=int64)

In [10]:
tmp.shape

(3, 11)

# TF-IDF(단어 빈도 - 역문서 빈도)
* TF(t, d) : 단어 t가 문서 d에 등장하는 횟수(단어의 횟수)
* DF(t, d) : 단어 t가 포함된 문서 d의 수
* IDF(t, d) : 1/DF(t, d) - 자주 사용되지 않을 때 IDF 값이 커짐
* TF-IDF : TF * IDF
* 스팸 메일 분류하는 데 많이 사용

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer

In [12]:
tfidf = TfidfTransformer(use_idf= True,
                         norm= 'l2',
                         smooth_idf= True)
tfidf.fit_transform(tmp).toarray()

array([[0.        , 0.        , 0.5628291 , 0.        , 0.5628291 ,
        0.42804604, 0.        , 0.        , 0.        , 0.42804604,
        0.        ],
       [0.        , 0.        , 0.        , 0.45985353, 0.        ,
        0.45985353, 0.        , 0.        , 0.60465213, 0.45985353,
        0.        ],
       [0.42339448, 0.42339448, 0.        , 0.32200242, 0.        ,
        0.        , 0.42339448, 0.42339448, 0.        , 0.        ,
        0.42339448]])

## Data Split

In [13]:
X_train = df.loc[:25000, 'review'].values
X_test = df.loc[:25000, 'review'].values

Y_train = df.loc[:25000, 'sentiment'].values
Y_test = df.loc[:25000, 'sentiment'].values

## Modeling

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
tfidf = TfidfVectorizer(strip_accents= None,
                        lowercase= False,
                        preprocessor= None,
                        stop_words= None,
                        ngram_range= (1,1))

tfidf_lr = Pipeline([('tfidf', tfidf),
                     ('lr', LogisticRegression(solver= 'liblinear',
                                               C= 10,
                                               penalty= 'l2',
                                               random_state= 0))])

In [16]:
tfidf_lr.fit(X_train, Y_train)

## Evaluation

In [17]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

In [18]:
pred = tfidf_lr.predict(X_test)

print('F1-Score : {:.4f}'.format(f1_score(Y_test, pred)))
print('Accuracy Score : {:.4f}'.format(accuracy_score(Y_test, pred)))
print('\nConfusion Matrix:\n',confusion_matrix(Y_test, pred))

F1-Score : 0.9924
Accuracy Score : 0.9924

Confusion Matrix:
 [[12401    99]
 [   91 12410]]
