## 1. 데이터 준비

In [1]:
#pip install gensim

In [2]:
import pandas as pd
import nltk
# Fetch the NLTK resources this notebook relies on: the tokenizer models
# (used by nltk.word_tokenize), the English POS tagger (used by nltk.pos_tag)
# and the stopword lists (used via nltk.corpus.stopwords).
nltk.download("punkt")
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\125\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\125\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\125\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\125\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

##### ♦️1. 실습예제
- 리뷰가 30글자 미만인 데이터 제거
- 추가 전처리를 적용한 코드 작성
- 정규표현식을 이용해 이모티콘 및 특수문자제거
- 명사, 동사 불용어제거까지 적용된 document 리스트 만들기

* #명사(NN, NNS, NNP, NNPS)
* #동사(VB, VBD, VBG, VBN, VBP, VBZ)

### 1.1 추가 전처리 (lemmatizer or stemmer)
- lemmatizer : 어근을 찾아 원형변형, 표제어로 변경 (running -> run, better -> good)
- stemmer : 어간 추출을 통해 접미사를 제거 (처리속도가 빠르지만 의미왜곡 우려)

In [3]:
# Load the crawled English reviews; df.head() below shows the columns
# Date, Name, Score, Review and like.
df = pd.read_csv('./Data/영어 크롤링.csv')

In [4]:
df.head()

Unnamed: 0,Date,Name,Score,Review,like
0,"January 26, 2020",Bernard Reis,1,Completely frustrating experience. Paid extra ...,7
1,"January 18, 2020",Gordon Inman,2,"Some good features, but this app isn't very sm...",8
2,"January 18, 2020",MrDurma,4,"This app works perfectly, it allowed us to dow...",2
3,"January 26, 2020",K Kostas,3,Good but not great for an app by LG. Not the m...,1
4,"January 17, 2020",Greg Evans,3,App is useful but limited and requires some up...,1


In [5]:
# Keep only reviews that are at least 30 characters long.
# .str.len() yields NaN for a missing review, which compares False and is
# dropped, whereas len() on NaN would raise TypeError. .copy() makes the
# result an independent frame so that adding columns later does not trigger
# SettingWithCopyWarning.
df = df[df['Review'].str.len() >= 30].copy()

In [6]:
df.head()

Unnamed: 0,Date,Name,Score,Review,like
0,"January 26, 2020",Bernard Reis,1,Completely frustrating experience. Paid extra ...,7
1,"January 18, 2020",Gordon Inman,2,"Some good features, but this app isn't very sm...",8
2,"January 18, 2020",MrDurma,4,"This app works perfectly, it allowed us to dow...",2
3,"January 26, 2020",K Kostas,3,Good but not great for an app by LG. Not the m...,1
4,"January 17, 2020",Greg Evans,3,App is useful but limited and requires some up...,1


In [7]:
# English stopwords, stored as a set: pos_tagging tests membership once per
# token, and a set lookup is O(1) instead of a scan over the whole list.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [8]:
import re
# Compiled once instead of on every call. Matching whole runs of non-letters
# replaces them with a single space, so no separate whitespace pass is needed.
_NON_ALPHA_RUN = re.compile(r'[^a-zA-Z]+')

# Penn Treebank tags that pos_tagging keeps: nouns and verbs.
_KEPT_POS_TAGS = frozenset({
    'NN', 'NNS', 'NNP', 'NNPS',
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
})


def pos_tagging(string):
    """Return the noun and verb tokens of an English text, without stopwords.

    Every run of characters other than ASCII letters (digits, punctuation,
    emoji, whitespace) is replaced by one space, the text is lower-cased,
    tokenized and POS-tagged with NLTK, and only tokens tagged as a noun or
    verb that are not in the module-level ``stopwords`` collection are kept.

    Args:
        string: The raw review text.

    Returns:
        A list of lower-case tokens in their original order; empty when no
        noun or verb survives the filtering.
    """
    cleaned = _NON_ALPHA_RUN.sub(' ', string).lower()
    tagged = nltk.pos_tag(nltk.word_tokenize(cleaned))
    return [
        word
        for word, tag in tagged
        if tag in _KEPT_POS_TAGS and word not in stopwords
    ]

In [9]:
# Tokenize every review into its noun/verb tokens (one list per row).
df['Review_words'] = df['Review'].apply(pos_tagging)

In [10]:
# Token lists that the dictionary and corpus below are built from.
# NOTE(review): this Series is captured before the rows with an empty token
# list are removed from df, so it still contains them (the display below
# shows 4870 entries, the last one being []).
tagged_pos_document = df['Review_words']

In [11]:
tagged_pos_document

0       [experience, paid, money, get, air, conditione...
1       [features, think, connect, device, give, ux, d...
2       [app, works, allowed, download, cycles, washer...
3       [app, lg, interface, adjustments, user, downlo...
4       [app, requires, updates, wifi, setup, took, ap...
                              ...                        
6941                                 [open, app, crashes]
6944                [reference, application, application]
6948                         [installing, showing, error]
6949                                      [apps, tnx, lg]
7041                                                   []
Name: Review_words, Length: 4870, dtype: object

In [12]:
# Discard reviews whose token list came out empty.
df = df[df['Review_words'].map(len) != 0]

In [13]:
# Renumber the rows after filtering; the previous index is kept as a new
# 'index' column, which the next cell drops.
df.reset_index(inplace = True)

In [14]:
# Remove the 'index' column left behind by reset_index above.
df.drop('index', axis = 1 , inplace = True)

In [15]:
df

Unnamed: 0,Date,Name,Score,Review,like,Review_words
0,"January 26, 2020",Bernard Reis,1,Completely frustrating experience. Paid extra ...,7,"[experience, paid, money, get, air, conditione..."
1,"January 18, 2020",Gordon Inman,2,"Some good features, but this app isn't very sm...",8,"[features, think, connect, device, give, ux, d..."
2,"January 18, 2020",MrDurma,4,"This app works perfectly, it allowed us to dow...",2,"[app, works, allowed, download, cycles, washer..."
3,"January 26, 2020",K Kostas,3,Good but not great for an app by LG. Not the m...,1,"[app, lg, interface, adjustments, user, downlo..."
4,"January 17, 2020",Greg Evans,3,App is useful but limited and requires some up...,1,"[app, requires, updates, wifi, setup, took, ap..."
...,...,...,...,...,...,...
4837,"May 11, 2018",A Google user,5,It is very good and useful app in my lifr,0,"[app, lifr]"
4838,"October 2, 2017",repou Ltd,1,"Can't even open the app, crashes.",0,"[open, app, crashes]"
4839,"July 22, 2018",jayanth murli,5,It is well helpfull for my reference this appl...,0,"[reference, application, application]"
4840,"May 29, 2018",Mukul,1,It's not installing showing a 505 error....,0,"[installing, showing, error]"


## 2. LDA 데이터 세팅

In [16]:
import gensim
from gensim import corpora, models

In [17]:
# Build the token <-> integer id mapping from the tokenized reviews.
dictionary = corpora.Dictionary(tagged_pos_document)

In [18]:
# Prune the vocabulary: keep tokens that occur in at least 5 documents and in
# no more than 50% of all documents, then keep at most the 5000 most frequent.
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=5000)
# Reassign ids so that they are contiguous after the removal.
# NOTE(review): filter_extremes may already compact the ids itself — confirm
# against the gensim docs whether this call is still needed.
dictionary.compactify()

In [19]:
dictionary.token2id # 단어를 id로 인코딩

{'air': 0,
 'app': 1,
 'cache': 2,
 'cleared': 3,
 'conditioner': 4,
 'connect': 5,
 'connectivity': 6,
 'device': 7,
 'disconnected': 8,
 'experience': 9,
 'frustrated': 10,
 'get': 11,
 'gets': 12,
 'minute': 13,
 'money': 14,
 'nothing': 15,
 'paid': 16,
 'reinstalling': 17,
 'replaced': 18,
 'reset': 19,
 'router': 20,
 'takes': 21,
 'times': 22,
 'try': 23,
 'use': 24,
 'work': 25,
 'works': 26,
 'connected': 27,
 'directions': 28,
 'effort': 29,
 'features': 30,
 'give': 31,
 'look': 32,
 'match': 33,
 'product': 34,
 'purchased': 35,
 'put': 36,
 'rate': 37,
 'see': 38,
 'start': 39,
 'stove': 40,
 'think': 41,
 'used': 42,
 'ux': 43,
 'written': 44,
 'allowed': 45,
 'cool': 46,
 'cycles': 47,
 'download': 48,
 'dryer': 49,
 'electricity': 50,
 'info': 51,
 'kind': 52,
 'know': 53,
 'kwh': 54,
 'make': 55,
 'monitoring': 56,
 'power': 57,
 'problems': 58,
 'rating': 59,
 'section': 60,
 'show': 61,
 'shows': 62,
 'supposed': 63,
 'thing': 64,
 'washer': 65,
 'add': 66,
 'bit': 6

In [20]:
corpus = [dictionary.doc2bow(doc) for doc in tagged_pos_document]

In [21]:
corpus[0] # (id, 빈도)

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 2),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 2),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 1)]

## 3. LDA 토픽 수 설정하기

In [22]:
# Number of topics for this first, exploratory model.
top_n = 3

# random_state pins the model's random initialisation; without it every run
# yields different topics and the printed keywords cannot be reproduced.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=top_n,
    id2word=dictionary,
    random_state=42,
)

In [23]:
ldamodel.print_topics(num_words = 10)

[(0,
  '0.071*"app" + 0.026*"lg" + 0.023*"phone" + 0.015*"get" + 0.014*"access" + 0.014*"work" + 0.011*"connect" + 0.011*"device" + 0.010*"use" + 0.010*"washer"'),
 (1,
  '0.092*"app" + 0.024*"washer" + 0.020*"machine" + 0.018*"dryer" + 0.017*"work" + 0.016*"washing" + 0.015*"use" + 0.013*"connect" + 0.013*"time" + 0.013*"lg"'),
 (2,
  '0.042*"app" + 0.025*"phone" + 0.022*"permissions" + 0.019*"lg" + 0.018*"washer" + 0.017*"work" + 0.013*"use" + 0.013*"dryer" + 0.012*"need" + 0.012*"cycle"')]

## 4. LDA 토픽 수 찾기
- Coherence & perplexity
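  * Coherence : c_v 기준 0~1 (클수록 좋음). 4.1에서 사용하는 u_mass는 보통 음수이며 0에 가까울수록(클수록) 좋음
  * Perplexity : `log_perplexity()`는 단어당 로그 우도 하한(음수)을 반환함. 값이 클수록(0에 가까울수록) perplexity = 2^(-bound)가 작아지므로 좋음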

In [24]:
from gensim.models import CoherenceModel

# Train a 3-topic model with a fixed seed so the coherence and perplexity
# values reported below are reproducible across runs.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    random_state=42,
)
# 'c_v' is the measure CoherenceModel uses by default; it is spelled out here
# because the topic-count sweep further down uses 'u_mass' instead.
coherencemodel = CoherenceModel(
    model=ldamodel,
    texts=tagged_pos_document,
    dictionary=dictionary,
    coherence='c_v',
)

In [25]:
# Coherence of the 3-topic model built in the previous cell.
coherencescore = coherencemodel.get_coherence()

In [26]:
coherencescore 

0.46100652914305035

In [27]:
ldamodel.log_perplexity(corpus) 

-6.346379694383892

### 4.1 Coherence

In [28]:
#pip install matplotlib

In [29]:
import matplotlib.pyplot as plt

In [30]:
from tqdm import tqdm

# Train one LDA model per candidate topic count (2..6) and record both
# quality metrics so they can be plotted against the topic count.
c_score = []  # u_mass coherence, one value per topic count
p_score = []  # value of log_perplexity(corpus), one per topic count
for i in tqdm(range(2,7)):
    # A fixed seed keeps the models comparable: differences between the
    # scores then come from the topic count, not from random initialisation.
    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus,
        num_topics=i,
        id2word=dictionary,
        random_state=42,
    )
    coherencemodel = CoherenceModel(
        model=ldamodel,
        texts=tagged_pos_document,
        dictionary=dictionary,
        coherence='u_mass',
    )
    c_score.append(coherencemodel.get_coherence())
    p_score.append(ldamodel.log_perplexity(corpus))
    

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:32<00:00, 18.56s/it]


In [None]:
# Coherence score for each candidate topic count.
x = range(2, 7)
fig, ax = plt.subplots()
ax.plot(x, c_score)
plt.show()

### 4.2 Perplexity

In [None]:
# log_perplexity value for each candidate topic count.
fig, ax = plt.subplots()
ax.plot(x, p_score)
plt.show()

### 4.3 설정된 토픽 수로 LDA 적용

In [None]:
# Final model with 4 topics (presumably chosen from the curves above —
# confirm). The fixed seed makes the topic assignments reproducible.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=4,
    id2word=dictionary,
    random_state=42,
)

In [None]:
lda_model.get_document_topics(corpus)[2]

In [None]:
# Scratch example — presumably illustrating how (topic id, probability) pairs
# split into two parallel lists; both names are rebound inside the loop of
# the next section.
label = [2,3]
value = [0.7 , 0.2]

## 5. 문서별 대표 토픽 할당

In [None]:
# Assign every review in df its dominant topic, i.e. the topic id with the
# highest probability for that review.
#
# The bag-of-words vectors are rebuilt from df['Review_words'] instead of
# reusing `corpus`: `corpus` was built before the rows with an empty token
# list were dropped from df, so it holds more documents than df has rows and
# its result could not be stored as a df column.
topic_list = []
for words in df['Review_words']:
    doc_topics = lda_model.get_document_topics(dictionary.doc2bow(words))
    # max() returns the first pair with the highest probability, so ties are
    # resolved towards the earlier topic; no numpy is needed.
    dominant_topic, _ = max(doc_topics, key=lambda pair: pair[1])
    topic_list.append(dominant_topic)

In [None]:
# Store each review's dominant topic id as a new column. topic_list must hold
# exactly one entry per row of df, otherwise pandas raises a ValueError for
# the length mismatch.
df['topic'] = topic_list

In [None]:
lda_model.print_topics(num_words = 8)

In [None]:
import sys, platform

# Report the interpreter and platform this notebook was executed with.
for caption, detail in (
    ("Python 버전 :", sys.version),
    ("실행 파일 경로 :", sys.executable),
    ("플랫폼 :", platform.platform()),
):
    print(caption, detail)