In [19]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [29]:
data= fetch_20newsgroups(remove=('headers', 'footers', 'quotes')).data
print(data[0])
# convert the text to a tf-idf weighted term-document matrix
vectorizer = TfidfVectorizer(max_features=2000, min_df=10, stop_words='english')
X = vectorizer.fit_transform(data)
idx_to_word = np.array(vectorizer.get_feature_names())

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [3]:
# apply NMF
nmf = NMF(n_components=20, solver="mu")
W = nmf.fit_transform(X)
H = nmf.components_
 
# print the topics
for i, topic in enumerate(H):
    print("Topic {}: {}".format(i + 1, ","
                                .join([str(x) for x in idx_to_word[topic.argsort()[-10:]]])))

Topic 1: want,said,make,say,right,way,really,did,good,time
Topic 2: appreciated,information,email,help,info,looking,hi,advance,mail,thanks
Topic 3: lord,church,christians,christian,believe,faith,christ,bible,jesus,god
Topic 4: algorithm,public,escrow,use,government,keys,clipper,encryption,chip,key
Topic 5: mac,cd,floppy,controller,ide,hard,drives,disk,scsi,drive
Topic 6: 50,20,price,condition,offer,shipping,10,new,sale,00
Topic 7: problem,running,using,use,program,window,files,dos,file,windows
Topic 8: teams,win,hockey,play,players,season,year,games,team,game
Topic 9: pub,ftp,cc,university,cs,soon,banks,gordon,pitt,edu
Topic 10: oil,new,speed,miles,good,dealer,engine,bike,cars,car
Topic 11: ram,color,bus,driver,vga,cards,drivers,monitor,video,card
Topic 12: appreciated,ve,work,doesn,help,program,mean,anybody,know,does
Topic 13: months,couple,bike,yeah,tried,just,heard,seen,got,ve
Topic 14: want,guns,rights,right,gun,think,government,know,people,don
Topic 15: group,mean,bike,oh,wanted,t

## 문서 유사도 검사

In [33]:
# 20개의 뉴스 그룹, 20개의 토픽으로 18000개의 포스팅
# 이메일 텍스트 형식, 제목, 날짜등의 헤더 정보와 이메일 내용으로 구성
from sklearn.datasets import fetch_20newsgroups
import io
import pandas as pd

newsgroups_train = fetch_20newsgroups(subset='train')


In [7]:

def parseDocument(data):
    buf = io.StringIO(data)
    line=buf.readline()
    data=[]
    subject=''
    while line:
        if(line.startswith('Subject:')):
            subject = line[8:].strip()
        elif (line.startswith('Lines:')):
            lines = line[6:]
            while line :
                line = buf.readline()
                data.append(line)
        line=buf.readline()
    text = ''.join(data)
    
    return subject,text


In [38]:
textlist = []
df = pd.DataFrame(columns=['text'])
for data in newsgroups_train.data[0:100]:
    subject,text = parseDocument(data)
    df.loc[subject]=text
df.head()


Unnamed: 0,text
WHAT car is this!?,\n I was wondering if anyone out there could e...
SI Clock Poll - Final Call,NNTP-Posting-Host: carson.u.washington.edu\n\n...
PB questions...,"\nwell folks, my mac plus finally gave up the ..."
Re: Weitek P9000 ?,Distribution: world\nNNTP-Posting-Host: amber....
Re: Shuttle Launch Question,"\nFrom article <C5owCB.n3p@world.std.com>, by ..."


### Tfidf를 이용한 단어 벡터화

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df['text'].tolist())
print(vectors.shape)


(100, 6194)


### NMF를 이용하여 본문에서 특성 추출

In [40]:
from sklearn.decomposition import NMF

vector_array = vectors.toarray()
nmf = NMF(n_components=40)
nmf.fit(vector_array)
features = nmf.transform(vector_array)



In [43]:
print(features.shape)
print(features[0])

(100, 40)
[0.03321494 0.02420649 0.         0.00772843 0.         0.00270331
 0.00844475 0.09678481 0.03956872 0.         0.         0.00419682
 0.         0.         0.         0.         0.         0.01880104
 0.00628367 0.00074428 0.         0.01878573 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.01034834
 0.04420581 0.00102825 0.         0.01920032]


### Feature 정규화 -> Normalizer

In [12]:
from sklearn.preprocessing import Normalizer

normalizer = Normalizer()
norm_features=normalizer.fit_transform(features)

print(norm_features[0:2])

[[0.0099826  0.         0.         0.         0.         0.10519253
  0.         0.         0.         0.         0.         0.
  0.         0.         0.00367699 0.         0.         0.
  0.         0.99427135 0.         0.         0.         0.00322383
  0.         0.         0.         0.         0.         0.01534436
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.15626867 0.05739289 0.         0.00719865 0.10545099 0.34397155
  0.32620129 0.01080693 0.         0.         0.07025589 0.
  0.04008522 0.         0.00280546 0.         0.         0.
  0.         0.00825205 0.         0.07892168 0.         0.
  0.         0.         0.         0.         0.01072896 0.
  0.         0.         0.         0.01407957 0.74113397 0.08671768
  0.40732854 0.         0.         0.01993002]]


In [14]:
df_features = pd.DataFrame(norm_features,index=df.index.tolist())
df_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
WHAT car is this!?,0.009983,0.000000,0.000000,0.000000,0.000000,0.105193,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
SI Clock Poll - Final Call,0.156269,0.057393,0.000000,0.007199,0.105451,0.343972,0.326201,0.010807,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.014080,0.741134,0.086718,0.407329,0.000000,0.000000,0.019930
PB questions...,0.726192,0.077977,0.038312,0.087823,0.000000,0.146344,0.000000,0.008768,0.117020,0.384142,...,0.065897,0.000000,0.000000,0.127831,0.206073,0.063482,0.000000,0.000000,0.099691,0.027466
Re: Weitek P9000 ?,0.169435,0.055909,0.000000,0.000000,0.000000,0.083417,0.633527,0.000000,0.000000,0.113903,...,0.045244,0.000000,0.017631,0.000000,0.000000,0.196740,0.000000,0.000000,0.000000,0.000000
Re: Shuttle Launch Question,0.000000,0.000000,0.000000,0.071310,0.000000,0.007946,0.000000,0.000000,0.611128,0.000000,...,0.108425,0.000000,0.134026,0.222538,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Terminal for sale,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.919494,0.000000,0.000000,...,0.042615,0.000000,0.000000,0.000000,0.390787,0.000000,0.000000,0.000000,0.000000,0.000000
Re: Remember those names come election time.,0.161132,0.054927,0.000000,0.015909,0.323985,0.040756,0.000000,0.028336,0.642329,0.000000,...,0.000000,0.005402,0.000000,0.000000,0.184519,0.365251,0.000000,0.000000,0.209578,0.088016
Re: Interesting ADB behaviour on C650,0.607728,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.028067,0.000000,0.705174,...,0.056794,0.000000,0.000000,0.102355,0.305295,0.000000,0.000000,0.000000,0.000000,0.000000
"Re: request for information on ""essential tremor"" and Indrol?",0.016922,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.031976,0.000000,0.999052,0.000000,0.000000,0.000000,0.007268,0.000000,0.000000,0.000000


### 문서 유사도 계산 (코사인 유사도)

In [17]:
article = df_features.loc['Terminal for sale'] # “Terminal for sale” 라는 문서의 유사한 문서 검색
similarities=df_features.dot(article) # 각 문서의 특성 행렬과 article 문서의 특성 행렬 곱하기
top=similarities.nlargest() # 큰 값순으로 정렬

In [18]:
texts = df.loc[top.index]['text'].tolist()
i = 0
for text in texts:
    print('TITLE :'+top.index[i]+" Similarities:"+ str(top[i]))
    #print(text+'\n')
    i = i+1

TITLE :Terminal for sale Similarities:1.0000000000000002
TITLE :commodoree Similarities:0.9461851355572825
TITLE :FOR SALE: ULTRABOTS PC GAME Similarities:0.9108723021656145
TITLE :BIKE FOR SALE ... 1986 Harley FLHTC Similarities:0.9101275075499694
TITLE :XT clone for sale Similarities:0.9100088210355614
