In [14]:
import re
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# [4월 14일]
---

## # 뉴스그룹 데이터 토큰화
---

In [1]:
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle = True, random_state = 1,
                             remove = ('headers', 'footers', 'quotes'))

documents = dataset.data
print(len(documents))
documents[3]

11314


'Notwithstanding all the legitimate fuss about this proposal, how much\nof a change is it?  ATT\'s last product in this area (a) was priced over\n$1000, as I suspect \'clipper\' phones will be; (b) came to the customer \nwith the key automatically preregistered with government authorities. Thus,\naside from attempting to further legitimize and solidify the fed\'s posture,\nClipper seems to be "more of the same", rather than a new direction.\n   Yes, technology will eventually drive the cost down and thereby promote\nmore widespread use- but at present, the man on the street is not going\nto purchase a $1000 crypto telephone, especially when the guy on the other\nend probably doesn\'t have one anyway.  Am I missing something?\n   The real question is what the gov will do in a year or two when air-\ntight voice privacy on a phone line is as close as your nearest pc.  That\nhas got to a problematic scenario for them, even if the extent of usage\nnever surpasses the \'underground\' stature

In [15]:
# 토큰화 사용자 함수

# import re
# import nltk
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# nltk.download('stopwords')
# nltk.download('punkt')

def clean_text(d):
    pattern = r'[^a-zA-Z\s]'
    d = re.sub(pattern, '', d)
    return d

def clean_stopwords(d):
    stop_words = stopwords.words('english')
    return ' '.join([w.lower() for w in d.split() if w not in stop_words and len(w) > 3])

def tokenize(d):
    return word_tokenize(d)

In [16]:
import pandas as pd

news_df = pd.DataFrame({'article': documents})
display(news_df.head())
len(news_df)

Unnamed: 0,article
0,Well i'm not sure about the story nad it did s...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re..."
2,Although I realize that principle is not one o...
3,Notwithstanding all the legitimate fuss about ...
4,"Well, I will have to change the scoring on my ..."


11314

In [17]:
news_df.replace('', float('Nan'), inplace = True)
news_df.dropna(inplace = True)
display(news_df.head())
len(news_df)

Unnamed: 0,article
0,Well i'm not sure about the story nad it did s...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re..."
2,Although I realize that principle is not one o...
3,Notwithstanding all the legitimate fuss about ...
4,"Well, I will have to change the scoring on my ..."


11096

In [18]:
news_df['article'] = news_df['article'].apply(clean_text)
news_df['article']

0        Well im not sure about the story nad it did se...
1        \n\n\n\n\n\n\nYeah do you expect people to rea...
2        Although I realize that principle is not one o...
3        Notwithstanding all the legitimate fuss about ...
4        Well I will have to change the scoring on my p...
                               ...                        
11309    Danny Rubenstein an Israeli journalist will be...
11310                                                   \n
11311    \nI agree  Home runs off Clemens are always me...
11312    I used HP DeskJet with Orange Micros Grappler ...
11313                                          \nNo arg...
Name: article, Length: 11096, dtype: object

In [20]:
news_df['article'] = news_df['article'].apply(clean_stopwords)
news_df['article']

0        well sure story seem biased disagree statement...
1        yeah expect people read actually accept hard a...
2        although realize principle strongest points wo...
3        notwithstanding legitimate fuss proposal much ...
4        well change scoring playoff pool unfortunately...
                               ...                        
11309    danny rubenstein israeli journalist speaking t...
11310                                                     
11311    agree home runs clemens always memorable kinda...
11312    used deskjet orange micros grappler system upd...
11313    argument murphy scared hell came last year han...
Name: article, Length: 11096, dtype: object

In [21]:
news_df['article'][168]

'clarify considered upgrade product called although commandline compiler lists version dosonly product compiler capable producing executables windows significant difference comes many windowshosted tools assist developers creating windows executables much quickly basically automating thousands lines boilerplate code initial impressions favorable everyone registered user received considerable amount info regarding specifics havent call microsoft sure theyd happy send'

In [22]:
tokenized_news = np.asarray(news_df['article'].apply(tokenize))
tokenized_news[168]

['animal', 'rights', 'people', 'know', 'bike', 'riding', 'dogscats', 'racoons']

In [23]:
import numpy as np

drop_news = np.asarray([index for index, sentence in enumerate(tokenized_news) if len(sentence) <= 1])
news_texts = np.delete(tokenized_news, drop_news, axis = 0)
print(len(news_texts))

10939


## # Word2Vec (gensim 모듈)
---
- sg 파라미터 0 : CBOW
- sg 파라미터 1 : Skip-gram
- size : 벡터의 크기

### # CBOW
---

#### # 모델 구성
---

In [28]:
from gensim.models import Word2Vec

model = Word2Vec(sentences = news_texts,
                 window = 4,
                 size = 100,
                 min_count = 5,
                 workers = 4,
                 sg = 0)

#### # 유사도 측정
---

In [29]:
model.wv.similarity('king','queen')

0.90180784

In [30]:
model.wv.most_similar(positive = ['soldier'])

[('moslem', 0.9966962337493896),
 ('survivors', 0.9960116147994995),
 ('brutally', 0.9958341717720032),
 ('bodies', 0.9954071640968323),
 ('struggle', 0.9950077533721924),
 ('seized', 0.9947269558906555),
 ('carried', 0.9946165680885315),
 ('systematic', 0.9945094585418701),
 ('prison', 0.9943186640739441),
 ('fifteenth', 0.9943068027496338)]

In [31]:
model.wv.most_similar(positive = ['king', 'queen'], negative = ['woman'])

[('superman', 0.9967689514160156),
 ('compassop', 0.9950821995735168),
 ('dynamics', 0.9945865273475647),
 ('interplanetary', 0.9944478273391724),
 ('handbook', 0.9943879842758179),
 ('telecom', 0.993849515914917),
 ('seminar', 0.9938466548919678),
 ('labs', 0.9937224984169006),
 ('mccartney', 0.9936071634292603),
 ('pubxr', 0.9934794902801514)]

In [32]:
model.wv.most_similar(positive = ['male', 'female'], negative = ['prince'])

[('homes', 0.9945149421691895),
 ('mountains', 0.994161069393158),
 ('founders', 0.994027316570282),
 ('suspended', 0.9939975738525391),
 ('raids', 0.9939913153648376),
 ('testify', 0.9939495921134949),
 ('trampled', 0.993834376335144),
 ('remained', 0.9936137795448303),
 ('sixteenth', 0.9936009645462036),
 ('rockefellers', 0.993444561958313)]

### # Skip-gram
---

#### # 모델 구성
---

In [42]:
from gensim.models import Word2Vec

model = Word2Vec(sentences = news_texts,
                 window = 3,
                 size = 100,
                 min_count = 3,
                 workers = 4,
                 sg = 1)

#### # 유사도 측정
---

In [43]:
model.wv.similarity('king','queen')

0.7384585

In [44]:
model.wv.most_similar(positive = ['soldier'])

[('azerbadjan', 0.9671629667282104),
 ('survivor', 0.9659083485603333),
 ('turkiye', 0.9646977186203003),
 ('hanged', 0.9639320969581604),
 ('fruits', 0.9631222486495972),
 ('tragedy', 0.9612021446228027),
 ('seized', 0.9611082673072815),
 ('witnessed', 0.9611005187034607),
 ('rome', 0.9605036973953247),
 ('palestine', 0.9598425626754761)]

In [45]:
model.wv.most_similar(positive = ['king', 'queen'], negative = ['woman'])

[('evans', 0.9282205104827881),
 ('jones', 0.9179049730300903),
 ('hughes', 0.9169968962669373),
 ('geoff', 0.9159521460533142),
 ('graham', 0.9119118452072144),
 ('davewoodcscoloradoedu', 0.9116646647453308),
 ('taylor', 0.9095369577407837),
 ('baker', 0.9076945185661316),
 ('andrewidacomhpcom', 0.9069969654083252),
 ('manson', 0.9064415097236633)]

In [46]:
model.wv.most_similar(positive = ['male', 'female'], negative = ['prince'])

[('failure', 0.8513232469558716),
 ('extreme', 0.8415439128875732),
 ('males', 0.8384928703308105),
 ('median', 0.8311585187911987),
 ('dominant', 0.8195927143096924),
 ('organism', 0.8192030191421509),
 ('accidental', 0.8187032341957092),
 ('fully', 0.8169630765914917),
 ('newly', 0.815092921257019),
 ('sect', 0.8120183348655701)]