# 1. Load Libraries

In [1]:
import utils
import pandas as pd



# 2. Load Review files
- Amazon review data [Link](http://jmcauley.ucsd.edu/data/amazon/) 
- Automotive, Musical Instruments, Patio & Lawn and Garden

In [2]:
# Convert the raw review files into a form that is convenient to work with in Python.
# NOTE(review): utils.prepare_reviews() is defined outside this notebook; it is unpacked
# here into a (train, test) pair of review collections — confirm the record layout in utils.
train_reviews, test_reviews = utils.prepare_reviews()

In [3]:
# Inspect the loaded data: wrap the training records in a DataFrame and show its last rows.
review_columns = ['review', 'rating', 'sentiment', 'category']
train_df = pd.DataFrame(data=train_reviews, columns=review_columns)
train_df.tail(5)

Unnamed: 0,review,rating,sentiment,category
39840,received this product free of charge in exchan...,5.0,good,garden
39841,these tongs were provided to me by fresher pro...,5.0,good,garden
39842,received this item free of charge big sturdy a...,5.0,good,garden
39843,these grill tongs are great love how comfortab...,5.0,good,garden
39844,big and easy to use durable and made of the hi...,5.0,good,garden


In [4]:
# About 93% of the training documents are positive: the recorded output shows
# 37,057 'good' vs. 2,788 'bad' out of 39,845 reviews, so the classes are heavily imbalanced.
train_df['sentiment'].value_counts()

good    37057
bad      2788
Name: sentiment, dtype: int64

# 3. Prepare Doc2Vec input
- TaggedDocument 객체 생성 필요
- words = 토큰단위로 나눠진 List
  - ['단어1', '단어2', ..., '단어n']
- tags = 문서번호 및 문서 카테고리
  - ['문서번호', '카테고리1', '카테고리2', ..., '카테고리n']

In [5]:
# Convert the reviews into the input format Doc2Vec expects.
# The first element is displayed as a sanity check; in the recorded output it is a
# TaggedDocument whose tags are a document id and the sentiment label (['d0', 'good']).
encoded_reviews = utils.doc2vec_labeler(train_reviews)
encoded_reviews[0]

TaggedDocument(words=['needed', 'set', 'of', 'jumper', 'cables', 'for', 'my', 'new', 'car', 'and', 'these', 'had', 'good', 'reviews', 'and', 'were', 'at', 'good', 'price', 'they', 'have', 'been', 'used', 'few', 'times', 'already', 'and', 'do', 'what', 'they', 'are', 'supposed', 'to', 'no', 'complaints', 'there', 'what', 'will', 'say', 'is', 'that', 'feet', 'really', 'isn', 'an', 'ideal', 'length', 'sure', 'if', 'you', 'pull', 'up', 'front', 'bumper', 'to', 'front', 'bumper', 'they', 'are', 'plenty', 'long', 'but', 'lot', 'of', 'times', 'you', 'will', 'be', 'beside', 'another', 'car', 'or', 'can', 'get', 'really', 'close', 'because', 'of', 'this', 'would', 'recommend', 'something', 'little', 'longer', 'than', 'great', 'brand', 'get', 'version', 'though'], tags=['d0', 'good'])

# 4. Set Doc2Vec object

In [6]:
# Create the Doc2Vec instance. No arguments are passed here, so any model
# configuration lives inside utils.make_doc2vec_object().
model = utils.make_doc2vec_object()

# Build the corpus vocabulary on the Doc2Vec instance.
model.build_vocab(encoded_reviews)

# gensim >= 4.0 removed `wv.vocab` in favour of `wv.key_to_index`; fall back to the
# old attribute so this cell also runs on gensim 3.x.
vocab = getattr(model.wv, "key_to_index", None)
if vocab is None:
    vocab = model.wv.vocab
print("말뭉치의 단어 수:", len(vocab))

말뭉치의 단어 수: 19158


In [7]:
# Train the Doc2Vec model on the tagged reviews for a fixed number of passes.
FULL_DATA_EPOCHS = 10
model.train(
    encoded_reviews,
    total_examples=model.corpus_count,
    epochs=FULL_DATA_EPOCHS,
)

# 5. Check the result
- return [(유사한 단어1, 유사도 점수1), ..., (유사한 단어n, 유사도 점수n)]

In [8]:
# Look up the words whose vectors are closest to the query word.
query_word = 'car'
model.wv.most_similar(positive=[query_word])

[('softening', 0.5707731246948242),
 ('stringing', 0.5525894165039062),
 ('kmart', 0.5330182909965515),
 ('wishlist', 0.5068528056144714),
 ('refill', 0.4978862702846527),
 ('dobie', 0.4951757490634918),
 ('whoa', 0.4859882593154907),
 ('technological', 0.4750080406665802),
 ('misc', 0.4693489670753479),
 ('taylor', 0.46787726879119873)]

In [9]:
# Retrieve the training reviews most similar to one held-out review (test_reviews[1]).
# NOTE(review): judging by the recorded output, utils.get_similar_doc prints the query
# review and returns a table of matches with a similarity score — confirm in utils.
similar_docs = utils.get_similar_doc(test_reviews[1], train_reviews, model)
similar_docs

Review: 
 lucky is any car that comes to be our vehicle my husband is fastidious with the upkeep of our cars after rain our cars have been dried off in the garage nature carwash waxing etc is done on regular basis when off to carwash that facility must not use any harsh or abrasive equipment most times my hubby does the job himself pending the weather he has praised the microfiber drying towel and its pockets as well as the blade for accomplishing the job with more ease towel and blade are perfect for our vehicles 

Rating: 5.0
Sentiment: good
Category: auto


Unnamed: 0,index,review,rating,sentiment,category,score
0,27726,have had of these tuners you think learn after...,1.0,bad,music,0.63336
1,24814,am very pleased with the quality of this stand...,5.0,good,music,0.535001
2,22325,great strings use these on deering goodtime ii...,5.0,good,music,0.531501
3,33035,hooked it up to my and lb tank works great wis...,5.0,good,garden,0.516991
4,19358,for the money have no complaints the enclosure...,4.0,good,music,0.478231


In [10]:
# Evaluate the model on the test reviews against the full training set.
# NOTE(review): the recorded result (281) is an integer, so this looks like a count
# rather than a ratio — confirm what utils.get_accuracy actually measures.
utils.get_accuracy(test_reviews, train_reviews, model)

281

# 6. With sampled data

In [11]:
# Balance the classes by down-sampling the positive reviews to the number of negatives.
# A fixed seed keeps the sample (and everything trained on it) reproducible.
SAMPLE_SEED = 42

neg_sampled = train_df[train_df['sentiment'] == 'bad']
pos_sampled = (
    train_df[train_df['sentiment'] == 'good']
    # Size is derived from the negative class instead of a hard-coded 2788.
    .sample(n=len(neg_sampled), random_state=SAMPLE_SEED)
)

# Positives first, then negatives; ignore_index renumbers the rows 0..n-1.
sampled_df = pd.concat([pos_sampled, neg_sampled], ignore_index=True)
sampled_df.tail()

Unnamed: 0,review,rating,sentiment,category
5571,love the idea and have used an expanding hose ...,2.0,bad,garden
5572,all of these expanding hoses have the same pro...,2.0,bad,garden
5573,as with many of these types of hoses there are...,1.0,bad,garden
5574,look it rare that have trouble assembling prod...,2.0,bad,garden
5575,see update below for why knocked it down to st...,2.0,bad,garden


In [12]:
# Verify the class balance of the down-sampled data (the recorded output shows
# 2,788 'good' and 2,788 'bad').
sampled_df['sentiment'].value_counts()

bad     2788
good    2788
Name: sentiment, dtype: int64

In [13]:
# Convert the balanced DataFrame back into a list of plain tuples, one per row, in
# column order (review, rating, sentiment, category).
# itertuples(index=False, name=None) yields plain tuples in a single pass, avoiding
# the per-row Series construction that sampled_df.iloc[i] incurs.
sampled_reviews = list(sampled_df.itertuples(index=False, name=None))

In [14]:
# Encode the balanced reviews into the input format Doc2Vec expects.
encoded_reviews = utils.doc2vec_labeler(sampled_reviews)

# Create a fresh Doc2Vec instance. This rebinds `model`, replacing the model that
# was trained on the full data.
model = utils.make_doc2vec_object()

# Build the corpus vocabulary on the Doc2Vec instance.
model.build_vocab(encoded_reviews)

# gensim >= 4.0 removed `wv.vocab` in favour of `wv.key_to_index`; fall back to the
# old attribute so this cell also runs on gensim 3.x.
vocab = getattr(model.wv, "key_to_index", None)
if vocab is None:
    vocab = model.wv.vocab
print("말뭉치의 단어 수:", len(vocab))

말뭉치의 단어 수: 8728


In [15]:
# Train the Doc2Vec model on the balanced reviews for a fixed number of passes.
SAMPLED_DATA_EPOCHS = 10
model.train(
    encoded_reviews,
    total_examples=model.corpus_count,
    epochs=SAMPLED_DATA_EPOCHS,
)

In [16]:
# Retrieve the balanced-sample reviews most similar to one held-out review (test_reviews[3]).
# NOTE(review): judging by the recorded output, utils.get_similar_doc prints the query
# review and returns a table of matches with a similarity score — confirm in utils.
similar_docs = utils.get_similar_doc(test_reviews[3], sampled_reviews, model)
similar_docs

Review: 
 let be very candid this is very good drying towel however most of the towels use to dry off one of the cars work equally as well this towel is made of microfiber materials and is very soft and carefully noted that on my corvette it did not leave any cloth marks however that car does have great wax finish and scraping that wax would not be easy the towel does absorb good and the size is very good have yet to figure out the use of those pickets on both ends the manufacturer writes that you can put your hands in those pockets and you get better grip on the towel maybe maybe not the towel washed easily by just tossing in with the rest of your machine washables this is good product 

Rating: 5.0
Sentiment: good
Category: auto


Unnamed: 0,index,review,rating,sentiment,category,score
0,1285,for the price this is one of the best pop filt...,4.0,good,music,0.509794
1,1234,of the four only one actually popped right on ...,5.0,good,auto,0.478094
2,2348,bought this tool because of the reviews the re...,5.0,good,garden,0.474698
3,242,best polish have found,5.0,good,auto,0.462356
4,1202,great quality casters for the money just as go...,5.0,good,music,0.453389


In [17]:
# Evaluate the model on the test reviews against the balanced sample.
# NOTE(review): the recorded result (148) is an integer, so this looks like a count
# rather than a ratio — confirm what utils.get_accuracy actually measures.
utils.get_accuracy(test_reviews, sampled_reviews, model)

148