In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt#plotting library for the Python programming language
#and its numerical mathematics extension NumPy
import nltk

In [16]:
# Download NLTK's 'stopwords' corpus; nltk.corpus.stopwords reads it when the
# English stop-word list is loaded later in the notebook.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Let pandas show up to 200 characters per cell so long documents are not
# truncated with "..." when a DataFrame is displayed.
pd.options.display.max_colwidth = 200
%matplotlib inline
#sets the backend of matplotlib to the inline backend

In [4]:
# Toy corpus: eight short sentences used as the documents for this notebook.
corpus = [
    "The sky is blue and beautiful.",
    "Love this blue and beautiful sky!",
    "The quick brown fox jumps over the lazy dog.",
    "A king's breakfast has sausages, ham, bacon, eggs, toast, beans",
    "I love green eggs, ham, sausages and bacon!",
    "The brown fox is quick and the blue dog is lazy!",
    "The sky is very blue and the sky is very beautiful today",
    "The dog is lazy but the brown fox is quick!",
]

In [5]:
# One category label per document, listed in the same order as the documents.
labels = [
    'weather',
    'weather',
    'animals',
    'food',
    'food',
    'animals',
    'weather',
    'animals',
]

In [6]:
# Convert the list of sentences into a NumPy array, rebinding the same name.
corpus=np.array(corpus)

In [7]:
corpus

array(['The sky is blue and beautiful.',
       'Love this blue and beautiful sky!',
       'The quick brown fox jumps over the lazy dog.',
       "A king's breakfast has sausages, ham, bacon, eggs, toast, beans",
       'I love green eggs, ham, sausages and bacon!',
       'The brown fox is quick and the blue dog is lazy!',
       'The sky is very blue and the sky is very beautiful today',
       'The dog is lazy but the brown fox is quick!'], dtype='<U63')

In [8]:
#dtype '<U63' above: '<' = little-endian, 'U' = Unicode string, 63 = maximum length in characters; see https://docs.scipy.org/doc/numpy-1.15.4/reference/arrays.dtypes.html

In [11]:
# Pair each document with its category label in a two-column DataFrame.
corpus_df=pd.DataFrame({'Document':corpus,'Category':labels})

In [12]:
# Select the columns in an explicit order (Document first), then display the frame.
corpus_df=corpus_df[['Document','Category']]
corpus_df

Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,"A king's breakfast has sausages, ham, bacon, e...",food
4,"I love green eggs, ham, sausages and bacon!",food
5,The brown fox is quick and the blue dog is lazy!,animals
6,The sky is very blue and the sky is very beaut...,weather
7,The dog is lazy but the brown fox is quick!,animals


In [14]:
#preprocessing text corpus

In [17]:
# Tokenizer instance used by the document-normalization function.
wpt=nltk.WordPunctTokenizer()
# NLTK's English stop-word list (needs the 'stopwords' corpus to be downloaded first).
stop_words=nltk.corpus.stopwords.words('english')

In [22]:
def normalize_document(doc):
  """Return `doc` reduced to its lower-cased, non-stop-word tokens.

  Steps: delete every character that is not an ASCII letter or whitespace,
  lower-case and trim the text, tokenize it with the module-level `wpt`,
  drop tokens present in the module-level `stop_words`, and join the
  remaining tokens with single spaces.

  Args:
    doc: raw document text (a string).

  Returns:
    The normalized document as one space-separated string; empty if no
    token survives.
  """
  # Remove special characters. The flags must be passed by keyword: the
  # fourth positional parameter of re.sub is `count`, so passing them
  # positionally would cap the number of substitutions instead of applying
  # re.I (ignore case) and re.A (ASCII-only matching).
  doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
  doc = doc.lower().strip()
  # Tokenize the cleaned document.
  tokens = wpt.tokenize(doc)
  # Filter stop words out of the document.
  filtered_tokens = [token for token in tokens if token not in stop_words]
  # Re-create the document from the filtered tokens.
  return ' '.join(filtered_tokens)

In [23]:
# Wrap normalize_document with np.vectorize so it can be called on a whole
# array of documents and applied to each element.
normalize_corpus=np.vectorize(normalize_document)

In [24]:
# Normalize every document in the corpus and display the resulting array.
norm_corpus=normalize_corpus(corpus)
norm_corpus

array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog',
       'kings breakfast sausages ham bacon eggs toast beans',
       'love green eggs ham sausages bacon',
       'brown fox quick blue dog lazy', 'sky blue sky beautiful today',
       'dog lazy brown fox quick'], dtype='<U51')

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
# Bag-of-words vectorizer. min_df=0. and max_df=1. are floats, i.e.
# document-frequency proportions, so no term is dropped for being too rare
# or too common.
cv=CountVectorizer(min_df=0., max_df=1.)

In [27]:
# Learn the vocabulary from the normalized corpus and build the document-term
# count matrix (returned in sparse format), then display its summary.
cv_matrix=cv.fit_transform(norm_corpus)
cv_matrix

<8x20 sparse matrix of type '<class 'numpy.int64'>'
	with 42 stored elements in Compressed Sparse Row format>

In [28]:
# Convert the sparse count matrix to a dense array. The guard keeps this cell
# re-runnable: once `cv_matrix` has been rebound to an ndarray it no longer
# has a `.toarray()` method, so a second run would otherwise raise
# AttributeError.
if hasattr(cv_matrix, 'toarray'):
  cv_matrix = cv_matrix.toarray()

In [29]:
# Vocabulary terms learned by the vectorizer (one per column of the count matrix).
vocab=cv.get_feature_names_out()

In [30]:
# Topic model: fit a Latent Dirichlet Allocation model with three topics on
# the bag-of-words counts.
from sklearn.decomposition import LatentDirichletAllocation

Lda = LatentDirichletAllocation(
    n_components=3,
    max_iter=10000,
    random_state=0,  # fixed seed for a repeatable fit
)

# Fit the model and get the document-topic matrix (one row per document,
# one column per topic).
dt_matrix = Lda.fit_transform(cv_matrix)

# Label the three topic columns and display the table.
features = pd.DataFrame(dt_matrix, columns=['T1', 'T2', 'T3'])
features

Unnamed: 0,T1,T2,T3
0,0.832191,0.08348,0.084329
1,0.863554,0.0691,0.067346
2,0.047794,0.047776,0.90443
3,0.037243,0.925559,0.037198
4,0.049121,0.903076,0.047802
5,0.054902,0.047778,0.897321
6,0.888287,0.055697,0.056016
7,0.055704,0.055689,0.888607


In [33]:
# Review topic constituents: for every topic, print its (term, weight) pairs
# whose weight exceeds 0.60, heaviest first, followed by a blank line.
tt_matrix = Lda.components_
for topic_weights in tt_matrix:
  # Keep only the terms above the weight threshold ...
  topic = [pair for pair in zip(vocab, topic_weights) if pair[1] > 0.60]
  # ... and order them by descending weight (stable, so ties keep vocab order).
  topic.sort(key=lambda pair: pair[1], reverse=True)
  print(topic)
  print()

[('sky', 4.332439442470133), ('blue', 3.373774254787669), ('beautiful', 3.3323650509884386), ('today', 1.3325579855138985), ('love', 1.330415818217548)]

[('bacon', 2.33269586574902), ('eggs', 2.33269586574902), ('ham', 2.33269586574902), ('sausages', 2.33269586574902), ('love', 1.3354610533796558), ('beans', 1.3327735190105536), ('breakfast', 1.3327735190105536), ('kings', 1.3327735190105536), ('toast', 1.3327735190105536), ('green', 1.3325431515674175)]

[('brown', 3.3323473548404405), ('dog', 3.3323473548404405), ('fox', 3.3323473548404405), ('lazy', 3.3323473548404405), ('quick', 3.3323473548404405), ('jumps', 1.3324193772908193), ('blue', 1.2919423137963386)]

