# Topic Modeling Using Latent Dirichlet Allocation
#### LDA is used to classify text in a document to a particular topic. It builds a topic per document model and words per topic model, modeled as Dirichlet distributions.

In [22]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 

In [23]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
import gensim

## Step 1: Load the dataset
#### The dataset used is the 20newsgroup dataset that is available from sklearn. This dataset has news articles grouped into 20 news categories

In [25]:
from sklearn.datasets import fetch_20newsgroups

In [26]:
newsgroups_train = fetch_20newsgroups(subset='train', shuffle = True)
newsgroups_test = fetch_20newsgroups(subset='test', shuffle = True)

#### Topics of different news groups

In [27]:
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [28]:
type(newsgroups_train.data)

list

In [29]:
len(newsgroups_train.data)

11314

In [30]:
data=newsgroups_train.data

In [31]:
data[2]

'From: twillis@ec.ecn.purdue.edu (Thomas E Willis)\nSubject: PB questions...\nOrganization: Purdue University Engineering Computer Network\nDistribution: usa\nLines: 36\n\nwell folks, my mac plus finally gave up the ghost this weekend after\nstarting life as a 512k way back in 1985.  sooo, i\'m in the market for a\nnew machine a bit sooner than i intended to be...\n\ni\'m looking into picking up a powerbook 160 or maybe 180 and have a bunch\nof questions that (hopefully) somebody can answer:\n\n* does anybody know any dirt on when the next round of powerbook\nintroductions are expected?  i\'d heard the 185c was supposed to make an\nappearence "this summer" but haven\'t heard anymore on it - and since i\ndon\'t have access to macleak, i was wondering if anybody out there had\nmore info...\n\n* has anybody heard rumors about price drops to the powerbook line like the\nones the duo\'s just went through recently?\n\n* what\'s the impression of the display on the 180?  i could probably swin

## Step 2: Data Preprocessing

### Perform the following steps:

#### Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
#### All stopwords are removed.
#### Words are lemmatized - words in third person are changed to first person and verbs in past and future tenses are changed into present.
#### Words are stemmed - words are reduced to their root form.

In [32]:
lemma=WordNetLemmatizer()

In [33]:
new_data=[]
for a in data:
    a=a.lower()
    text=re.sub('[^a-z]',' ',a)
    text=text.split(' ')
    text=[lemma.lemmatize(word) for word in text if word not in stopwords.words('english')]
    text=' '.join(text)
    new_data.append(text)

In [34]:
print(len(new_data))

11314


In [35]:
new_data[2]

' twillis ec ecn purdue edu  thomas e willis  subject  pb question    organization  purdue university engineering computer network distribution  usa line      well folk  mac plus finally gave ghost weekend starting life    k way back        sooo  market new machine bit sooner intended     looking picking powerbook     maybe     bunch question  hopefully  somebody answer     anybody know dirt next round powerbook introduction expected   heard    c supposed make appearence  summer  heard anymore   since access macleak  wondering anybody info       anybody heard rumor price drop powerbook line like one duo went recently     impression display       could probably swing     got   mb disk rather      really feel much  better  display  yea  look great store   wow  really good     could solicit opinion people use         day day worth taking disk size money hit get active display    realize real subjective question  played around machine computer store breifly figured opinion somebody actuall

## Step 3: Bag of words on the dataset

In [36]:
vector=CountVectorizer()

In [37]:
X = vector.fit_transform(new_data)

In [38]:
X.shape

(11314, 82706)

#### Convert sparse matrix to gensim corpus.

In [39]:
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

#### Mapping from word IDs to words

In [40]:
id_map={}
for k,v in vector.vocabulary_.items():
    id_map[v]=k

## Step 4: Running LDA using Bag of Words

In [41]:
lda_model = gensim.models.LdaModel(corpus,id2word=id_map,num_topics=8,passes=15)

In [46]:
topics_words = lda_model.print_topics(num_topics=8,num_words=10)

In [47]:
for no,top in topics_words:
    print('topic',no)
    print(top)
    print('\n')

topic 0
0.014*"game" + 0.012*"team" + 0.008*"year" + 0.008*"player" + 0.007*"ca" + 0.006*"hockey" + 0.006*"play" + 0.005*"win" + 0.005*"season" + 0.004*"league"


topic 1
0.009*"line" + 0.008*"edu" + 0.007*"subject" + 0.007*"window" + 0.007*"file" + 0.007*"com" + 0.006*"organization" + 0.006*"use" + 0.005*"system" + 0.005*"drive"


topic 2
0.013*"space" + 0.012*"nasa" + 0.008*"armenian" + 0.008*"gov" + 0.005*"turkish" + 0.004*"center" + 0.003*"year" + 0.003*"moon" + 0.003*"earth" + 0.003*"orbit"


topic 3
0.008*"edu" + 0.006*"people" + 0.005*"would" + 0.005*"article" + 0.005*"writes" + 0.005*"subject" + 0.004*"organization" + 0.004*"state" + 0.004*"line" + 0.004*"one"


topic 4
0.016*"key" + 0.007*"chip" + 0.007*"one" + 0.006*"com" + 0.006*"encryption" + 0.006*"clipper" + 0.005*"would" + 0.005*"government" + 0.004*"system" + 0.004*"use"


topic 5
0.008*"one" + 0.008*"god" + 0.008*"people" + 0.008*"would" + 0.006*"edu" + 0.005*"think" + 0.005*"say" + 0.004*"gun" + 0.004*"com" + 0.004*"s

### Classification of the topics
#### Using the words in each topic and their categories

* Sports
* Computer hardware
* Space
* Encyption
* Relegion



## Step 6: Testing model on unseen document

In [102]:
num=20
unseen_document = newsgroups_test.data[num]
print(unseen_document)

From: Matjaz.Gams@IJS.si
Subject: ``Call for Papers for INFORMATIKA'' Vol. 17 No. 2
Originator: abrodnik@watdragon.uwaterloo.ca
Organization: Jozef Stefan Institute, Ljubljana, Slovenia
Lines: 69

This is an invitation to send articles to the Informatica magazine.
The first fully international issue has been published and echoes 
are quite favourable. For any information, contact (matjaz.gams@ijs.si). 

Dear Colleague,                                        April 25, 1993

Number 1 of volume 17 of Informatica is now out of print and some of you 
will receive it in a week or so. As you will see, the journal is structured
in the following way: the editorial (first page); profiles (second page
-- biography of an editor, in this issue, Terry Winograd); the edited
part of papers (pp. 3-80); mission and research reports (A plan for
knowledge archives project in Japan and CSLI in Stanford, pp. 81-100);
and news and announcements (pp. 101-108). This structure is mentioned to
give you a suggest

In [103]:
print(newsgroups_test.target[num])

1


In [104]:
list(newsgroups_test.target_names)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [105]:
def clean_text(a):
    a=a.lower()
    text=re.sub('[^a-z]',' ',a)
    text=text.split(' ')
    text=[lemma.lemmatize(word) for word in text if word not in stopwords.words('english')]
    text=' '.join(text)
    return text

In [106]:
unseen_document=clean_text(unseen_document)
unseen_document

' matjaz gam ijs si subject    call paper informatika   vol        originator  abrodnik watdragon uwaterloo ca organization  jozef stefan institute  ljubljana  slovenia line      invitation send article informatica magazine  first fully international issue published echo  quite favourable  information  contact  matjaz gam ijs si     dear colleague                                         april           number   volume    informatica print  receive week  see  journal structured following way  editorial  first page   profile  second page    biography editor  issue  terry winograd   edited part paper  pp         mission research report  plan knowledge archive project japan csli stanford  pp           news announcement  pp            structure mentioned give suggestion could help make content journal significant  diverse  interesting  bringing view discourse     great emphasis given called editorial page  page express opinion  belief  writing editor problem within scope computing informati

In [107]:
y = vector.transform([unseen_document])
y_corpus = gensim.matutils.Sparse2Corpus(y, documents_columns=False)
prediction = lda_model.get_document_topics(y_corpus)
list(sorted(prediction))

[[(1, 0.3165695),
  (2, 0.17115526),
  (3, 0.24209571),
  (4, 0.034918953),
  (5, 0.2257816)]]

#### The model correctly classifies the unseen document with 'x'% probability to the X category.