In [4]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

Ниже будет предобработка текста и прогон генсима.

In [7]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [8]:
data = df.content.values.tolist()
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
data = [re.sub('\s+', ' ', sent) for sent in data]
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ']


In [9]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  
data_words = list(sent_to_words(data))
print(data_words[:1])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


In [89]:
print(data_words[:2])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst'], ['from', 'guy', 'kuo', 'subje

In [21]:
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

print(trigram_mod[bigram_mod[data_words[0]]])

['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


In [22]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [114]:
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
spacy.load('en_core_web_sm')
nlp = spacy.load("en_core_web_sm")
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized[:1])

[['where', 'thing', 'car', 'nntp', 'posting', 'host', 'line', 'wonder', 'could', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'door', 'really', 'small', 'addition', 'front', 'bumper', 'separate', 'rest', 'body', 'know', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]


In [115]:
del df['data_lemmatized']

In [116]:
del df['topic']

In [117]:
df['data_lemmatized'] = data_lemmatized

In [25]:
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 5), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1)]]


In [26]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('addition', 1),
  ('body', 1),
  ('bring', 1),
  ('call', 1),
  ('car', 5),
  ('could', 1),
  ('day', 1),
  ('door', 2),
  ('early', 1),
  ('engine', 1),
  ('enlighten', 1),
  ('funky', 1),
  ('history', 1),
  ('host', 1),
  ('info', 1),
  ('know', 1),
  ('late', 1),
  ('lerxst', 1),
  ('line', 1),
  ('look', 2),
  ('mail', 1),
  ('make', 1),
  ('model', 1),
  ('name', 1),
  ('neighborhood', 1),
  ('nntp_poste', 1),
  ('park', 1),
  ('production', 1),
  ('really', 1),
  ('rest', 1),
  ('see', 1),
  ('separate', 1),
  ('small', 1),
  ('spec', 1),
  ('sport', 1),
  ('thank', 1),
  ('thing', 1),
  ('where', 1),
  ('wonder', 1),
  ('year', 1)]]

In [34]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [35]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.175*"file" + 0.073*"entry" + 0.057*"error" + 0.053*"display" + '
  '0.040*"program" + 0.030*"sun" + 0.025*"version" + 0.024*"cool" + '
  '0.020*"output" + 0.020*"crash"'),
 (1,
  '0.032*"would" + 0.026*"say" + 0.022*"think" + 0.022*"people" + 0.020*"go" + '
  '0.018*"know" + 0.016*"make" + 0.016*"see" + 0.014*"come" + 0.013*"thing"'),
 (2,
  '0.091*"evidence" + 0.048*"book" + 0.041*"faith" + 0.037*"reason" + '
  '0.033*"exist" + 0.032*"claim" + 0.031*"religion" + 0.029*"believe" + '
  '0.027*"christian" + 0.023*"church"'),
 (3,
  '0.541*"ax" + 0.054*"car" + 0.028*"player" + 0.014*"engine" + 0.009*"dealer" '
  '+ 0.009*"mile" + 0.009*"expensive" + 0.009*"here" + 0.008*"extra" + '
  '0.008*"specify"'),
 (4,
  '0.087*"belief" + 0.084*"internet" + 0.079*"atheist" + 0.061*"distribution" '
  '+ 0.053*"wing" + 0.041*"atheism" + 0.038*"printer" + 0.034*"interface" + '
  '0.026*"multiple" + 0.020*"thinking"'),
 (5,
  '0.088*"normal" + 0.072*"port" + 0.071*"mouse" + 0.053*"hole" + 0.03

Маллет у меня отказался работать. Джава устанавливаться не хочет.

In [52]:
mallet_path = '/Users/apple/Downloads/mallet-2.0.8/bin/mallet'
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=i, id2word=id2word)

CalledProcessError: Command '/Users/apple/Downloads/mallet-2.0.8/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input /var/folders/w3/h6phwtbd05b0gw2h4zhrnthr0000gn/T/9e9801_corpus.txt --output /var/folders/w3/h6phwtbd05b0gw2h4zhrnthr0000gn/T/9e9801_corpus.mallet' returned non-zero exit status 1.

Создаем функцию, выбирающую наилучшее количество тем для имеющихся у нас данных. Я ставлю значения от 1 до 36 и шаг 5, иначе программа будет работать невозможно долго.

In [50]:
def choose_the_best(text, dicti, corpus):
    b = 0
    for i in range(1,36,5):
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=i, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
        coherence_model_lda = CoherenceModel(model=lda_model, texts=text, dictionary=dicti, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        if coherence_lda > b:
            b = coherence_lda
            best = i
    return i

Прогоняем функцию и смотрим лучший вариант.

In [51]:
print(choose_the_best(data_lemmatized, id2word, corpus))

31


Далее работаем с ним

In [53]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=31, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4797184706598412


In [55]:
lda_model.show_topics(formatted=False, num_topics=31)

[(0,
  [('program', 0.07611365),
   ('file', 0.07137648),
   ('window', 0.056231245),
   ('run', 0.03422214),
   ('set', 0.031260442),
   ('entry', 0.029877502),
   ('application', 0.025770914),
   ('problem', 0.0245313),
   ('display', 0.021411153),
   ('use', 0.020050686)]),
 (1,
  [('write', 0.039949562),
   ('would', 0.032477118),
   ('article', 0.028458767),
   ('may', 0.024042275),
   ('make', 0.023390925),
   ('people', 0.020415578),
   ('many', 0.013441859),
   ('point', 0.01299268),
   ('question', 0.010302959),
   ('mean', 0.010044186)]),
 (2,
  [('image', 0.10383521),
   ('color', 0.09797277),
   ('package', 0.06891997),
   ('character', 0.062150832),
   ('picture', 0.055843342),
   ('normal', 0.0537578),
   ('tool', 0.04157325),
   ('excuse', 0.031506214),
   ('ensure', 0.028561708),
   ('screw', 0.027680824)]),
 (3,
  [('year', 0.035107005),
   ('good', 0.026380679),
   ('car', 0.019712321),
   ('high', 0.018115595),
   ('old', 0.017635448),
   ('will', 0.016281798),
   ('

In [62]:
topic_dict = {}
for i in lda_model.show_topics(formatted=False, num_topics=31):
    word_weight = {}
    for word in i[1]:
        word_weight[word[0]] = word[1]
    topic_dict[i[0]] = word_weight

Создаем функцию, достающую топики из текста.

In [97]:
def get_topic(text):
    dict_prob = {}
    for word in text:
        for i in range(len(topic_dict)):
            if word in topic_dict[i]:
                if i in dict_prob:
                    dict_prob[i] += topic_dict[i][word]
                else:
                    dict_prob[i] = topic_dict[i][word]
    if dict_prob != {}:
        topic = sorted(dict_prob.items(), key=lambda kv: kv[1], reverse=True)[0][0]
    else:
        topic = 'unknown'
    return topic

In [98]:
print(get_topic(['program', 'run']))

0


Применяем ее к целому столбцу нашего датафрейма.

In [118]:
df['topic'] = df['data_lemmatized'].apply(get_topic)

In [119]:
print(df)

                                                 content  target  \
0      From: lerxst@wam.umd.edu (where's my thing)\nS...       7   
1      From: guykuo@carson.u.washington.edu (Guy Kuo)...       4   
2      From: twillis@ec.ecn.purdue.edu (Thomas E Will...       4   
3      From: jgreen@amber (Joe Green)\nSubject: Re: W...       1   
4      From: jcm@head-cfa.harvard.edu (Jonathan McDow...      14   
...                                                  ...     ...   
11309  From: jim.zisfein@factory.com (Jim Zisfein) \n...      13   
11310  From: ebodin@pearl.tufts.edu\nSubject: Screen ...       4   
11311  From: westes@netcom.com (Will Estes)\nSubject:...       3   
11312  From: steve@hcrlgw (Steven Collins)\nSubject: ...       1   
11313  From: gunning@cco.caltech.edu (Kevin J. Gunnin...       8   

                   target_names  \
0                     rec.autos   
1         comp.sys.mac.hardware   
2         comp.sys.mac.hardware   
3                 comp.graphics   
4       

## ДАЛЕЕ ЧЕРНОВИК

In [120]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [158]:
d = df.groupby(['topic'])['data_lemmatized'].apply(lambda x: list(x.value_counts().index)).to_dict()
print(d[1])



In [137]:
for i in d.values():
    print(i)
    break



In [147]:
ind_words = []
for i in d.values():
    for b in i:
        for a in b:
            ind_words.append(a)
ind_words = list(set(ind_words))

In [152]:
def find_tfidf(dicti):
    for i in range(len(d)):
        texts_topic = [' '.join(a) for a in d[i]]
        vect = vectorizer.fit_transform(texts_topic)
        sents = []
        v = vect.toarray()
        for i in range(len(v)):
            ans = []
            for nums in v[i].argsort()[-5:]:
                ans.append(vectorizer.get_feature_names()[nums])
            sents.append(ans)
        

In [159]:
texts_topic = [''.join(a) for a in d[1]]
vect = vectorizer.fit_transform(texts_topic)
print(vect)

  (0, 497)	1.0
  (1, 4)	1.0
  (2, 38)	1.0
  (3, 192)	1.0
  (4, 976)	1.0
  (5, 514)	1.0
  (6, 678)	1.0
  (7, 280)	1.0
  (8, 693)	1.0
  (9, 13)	1.0
  (10, 541)	1.0
  (11, 113)	1.0
  (12, 34)	1.0
  (13, 464)	1.0
  (14, 668)	1.0
  (15, 289)	1.0
  (16, 229)	1.0
  (17, 764)	1.0
  (18, 706)	1.0
  (19, 891)	1.0
  (20, 753)	1.0
  (21, 61)	1.0
  (22, 178)	1.0
  (23, 216)	1.0
  (24, 167)	1.0
  :	:
  (988, 59)	1.0
  (989, 63)	1.0
  (990, 84)	1.0
  (991, 411)	1.0
  (992, 545)	1.0
  (993, 698)	1.0
  (994, 359)	1.0
  (995, 368)	1.0
  (996, 199)	1.0
  (997, 925)	1.0
  (998, 117)	1.0
  (999, 557)	1.0
  (1000, 101)	1.0
  (1001, 384)	1.0
  (1002, 906)	1.0
  (1003, 580)	1.0
  (1004, 195)	1.0
  (1005, 955)	1.0
  (1006, 654)	1.0
  (1007, 87)	1.0
  (1008, 516)	1.0
  (1009, 663)	1.0
  (1010, 913)	1.0
  (1011, 393)	1.0
  (1012, 502)	1.0


In [67]:
dicti = {}
dicti['bighr'] = -9
print(dicti)

{'bighr': -9}


In [68]:
if 'bighr' in dicti:
    print('f')

f
