In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 

In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
import gensim

## Step 1: Load the dataset
#### The dataset used is the 20 Newsgroups dataset, which is available from scikit-learn. It contains newsgroup posts grouped into 20 categories.

In [3]:
from sklearn.datasets import fetch_20newsgroups

In [4]:
# Load the predefined train and test splits of the 20 Newsgroups corpus,
# in that order, with identical options.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split_name, shuffle=True)
    for split_name in ('train', 'test')
)

#### Topics of different news groups

In [5]:
# Names of the newsgroup categories present in the training split.
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [6]:
# Raw text of the training posts, one string per document.
data=newsgroups_train.data

In [7]:
# Peek at one raw post; note that the message headers (From, Subject,
# Organization, ...) are part of the text.
data[2]

'From: twillis@ec.ecn.purdue.edu (Thomas E Willis)\nSubject: PB questions...\nOrganization: Purdue University Engineering Computer Network\nDistribution: usa\nLines: 36\n\nwell folks, my mac plus finally gave up the ghost this weekend after\nstarting life as a 512k way back in 1985.  sooo, i\'m in the market for a\nnew machine a bit sooner than i intended to be...\n\ni\'m looking into picking up a powerbook 160 or maybe 180 and have a bunch\nof questions that (hopefully) somebody can answer:\n\n* does anybody know any dirt on when the next round of powerbook\nintroductions are expected?  i\'d heard the 185c was supposed to make an\nappearence "this summer" but haven\'t heard anymore on it - and since i\ndon\'t have access to macleak, i was wondering if anybody out there had\nmore info...\n\n* has anybody heard rumors about price drops to the powerbook line like the\nones the duo\'s just went through recently?\n\n* what\'s the impression of the display on the 180?  i could probably swin

## Step 2: Data Preprocessing

### Perform the following steps:

#### Tokenization: Lowercase the text, replace every character that is not a letter or digit with a space (this removes punctuation), and split the result on spaces into words.
#### All stopwords are removed.
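#### Words are lemmatized with NLTK's `WordNetLemmatizer`. No part-of-speech tag is passed, so the lemmatizer uses its default setting: plural nouns are reduced to the singular (e.g. "questions" becomes "question"), while verb forms such as "gave" are left unchanged.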
#### Stemming (reducing words to their root form) is a common additional step, but it is not applied in this notebook; only lemmatization is used.

In [8]:
# Single lemmatizer instance, reused for every token in the cleaning step.
lemma=WordNetLemmatizer()

In [9]:
# Build the stop-word set once. The previous loop evaluated
# stopwords.words('english') inside the comprehension condition, i.e. once per
# token, and then scanned the returned list linearly. A set built up front
# gives the same membership answers with constant-time lookups.
english_stopwords = set(stopwords.words('english'))


def clean_document(raw_text):
    """Return ``raw_text`` normalised for bag-of-words vectorisation.

    Steps, in order:
      1. lowercase the text;
      2. replace every character outside ``[a-z0-9]`` with a space;
      3. split on single spaces;
      4. drop English stop words;
      5. lemmatize the remaining tokens with the shared ``lemma`` instance;
      6. re-join the tokens with single spaces.

    Splitting on ``' '`` (rather than on runs of whitespace) produces empty
    tokens wherever two spaces are adjacent. They are kept on purpose so the
    resulting string is identical to what the earlier loop produced.
    """
    normalised = re.sub('[^a-z0-9]', ' ', raw_text.lower())
    kept_tokens = [
        lemma.lemmatize(token)
        for token in normalised.split(' ')
        if token not in english_stopwords
    ]
    return ' '.join(kept_tokens)


# Cleaned counterpart of `data`, in the same order.
new_data = [clean_document(document) for document in data]

In [10]:
# Same post as data[2] after cleaning, for a before/after comparison.
new_data[2]

' twillis ec ecn purdue edu  thomas e willis  subject  pb question    organization  purdue university engineering computer network distribution  usa line  36  well folk  mac plus finally gave ghost weekend starting life 512k way back 1985   sooo  market new machine bit sooner intended     looking picking powerbook 160 maybe 180 bunch question  hopefully  somebody answer     anybody know dirt next round powerbook introduction expected   heard 185c supposed make appearence  summer  heard anymore   since access macleak  wondering anybody info       anybody heard rumor price drop powerbook line like one duo went recently     impression display 180   could probably swing 180 got 80mb disk rather 120  really feel much  better  display  yea  look great store   wow  really good     could solicit opinion people use 160 180 day day worth taking disk size money hit get active display    realize real subjective question  played around machine computer store breifly figured opinion somebody actuall

## Step 3: Bag of words on the dataset

In [13]:
# Bag-of-words vectorizer, constructed with scikit-learn's default settings.
vector=CountVectorizer()

In [14]:
# Learn the vocabulary from the cleaned posts and build the sparse
# document-term count matrix in one pass.
X = vector.fit_transform(new_data)

In [15]:
# (number of documents, vocabulary size)
X.shape

(11314, 120169)

#### Convert sparse matrix to gensim corpus.

In [17]:
# X has one row per document (see X.shape), so tell gensim that documents are
# NOT stored as columns; otherwise each vocabulary term would be treated as a
# document.
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

#### Mapping from word IDs to words

In [18]:
# CountVectorizer stores term -> column index; invert it to
# column index -> term so a feature id can be turned back into its word.
id_map = {column: term for term, column in vector.vocabulary_.items()}

## Step 4: Running LDA using Bag of Words

#### gensim provides two LDA implementations: `LdaModel` (single core) and `LdaMulticore` (parallel). This notebook uses `LdaMulticore` to train faster.

#### Code for LDA single core
##### lda_model = gensim.models.LdaModel(corpus, id2word=id_map, num_topics=10, passes=25)



In [None]:
# Train a 10-topic LDA model on the bag-of-words corpus using 2 worker
# processes and 10 passes over the data.
# random_state pins the model's random initialisation; without it every run
# starts from a different state and the topics cannot be compared between runs.
# NOTE(review): with multiple workers the result may still vary slightly
# between runs — confirm if exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(
    corpus,
    id2word=id_map,
    num_topics=10,
    passes=10,
    workers=2,
    random_state=42,
)