# Other Tools: Gensim

```yaml
Course:   DS 5001
Module:   08a Visualization
Topic:    Other Tools
Author:   R.C. Alvarado
Date:     23 March 2023
```

## Set Up

### Config

In [2]:
import configparser
config = configparser.ConfigParser()
config.read("../../../env.ini")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
local_lib = config['DEFAULT']['local_lib']

In [6]:
local_lib

'/Users/Samantha/Desktop/MSDS/DS5001/repo/lessons/lib'

In [8]:
num_topics = 100
data_dir = f"{data_home}/newsgroups/20news-18828"

### Imports

In [6]:
import pandas as pd
import numpy as np
from gensim import corpora, models
from collections import defaultdict
import plotly_express as px
from glob import glob
import re 

## Import Data

In [7]:
def import_data():
    data = []
    for d in glob(data_dir+"/*"):
        label = d.split("/")[-1]
        print(label)
        for f in glob(d+"/*"):
            fid = f.split("/")[-1]
            flines = open(f, 'r', encoding="latin-1").read().split("\n")
            from_line = ':'.join(flines[0].split(':')[1:])
            subj_line = ':'.join(flines[1].split(':')[1:])
            data.append((fid, label, from_line, subj_line, ' '.join(flines[2:])))
    LIB = pd.DataFrame(data, columns=['doc_id','doc_label','doc_from', 'doc_subj', 'doc_content'])
    LIB.doc_id = LIB.doc_id.astype('int')
    LIB = LIB.set_index(['doc_label','doc_id'])
    return LIB

In [8]:
LIB = import_data()

talk.politics.mideast
rec.autos
comp.sys.mac.hardware
alt.atheism
rec.sport.baseball
comp.os.ms-windows.misc
rec.sport.hockey
sci.crypt
sci.med
talk.politics.misc
rec.motorcycles
comp.windows.x
comp.graphics
comp.sys.ibm.pc.hardware
sci.electronics
talk.politics.guns
sci.space
soc.religion.christian
misc.forsale
talk.religion.misc


In [9]:
LIB

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_from,doc_subj,doc_content
doc_label,doc_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
talk.politics.mideast,75895,hm@cs.brown.edu (Harry Mamaysky),Heil Hernlem,In article <1993Apr14.125813.21737@ncsu.edu> ...
talk.politics.mideast,76248,waldo@cybernet.cse.fau.edu (Todd J. Dicker),Re: Israel's Expansion II,"ab4z@Virginia.EDU (""Andi Beyer"") writes: > F..."
talk.politics.mideast,76277,C.L.Gannon@newcastle.ac.uk (Space Cadet),"Re: To be exact, 2.5 million readers enlighte...",Andrew Varvel writes: > > > Serdar Argic >...
talk.politics.mideast,76045,shaig@Think.COM (Shai Guday),"Basil, opinions? (Re: Water on the brain)",In article <1993Apr15.204930.9517@thunder.mcr...
talk.politics.mideast,77197,ez000281@hamlet.ucdavis.edu (),Re: The Stage is Being Set,Srinivas Suder writes: >If the Haitian peopl...
...,...,...,...,...
talk.religion.misc,83934,porta@wam.umd.edu (David Palmer),Re: 14 Apr 93 God's Promise in 1 John 1: 7,In article <1qknu0INNbhv@shelley.u.washington...
talk.religion.misc,82812,decay@cbnewsj.cb.att.com (dean.kaflowitz),Re: Spreading Christianity (Re: Christian Ext...,"In article <C51puA.K2u@mailer.cc.fsu.edu>, dl..."
talk.religion.misc,84127,ekr@kyle.eitech.com (Eric Rescorla),"Re: What part of ""No"" don't you understand?",In article <1993Apr24.214843.10940@midway.uch...
talk.religion.misc,84315,"""David R. Sacco"" <dsav+@andrew.cmu.edu>",Re: ABORTION and private health coverage -- l...,On 21-Apr-93 in Re: ABORTION and private he.....


In [10]:
LIB.to_csv(f"{output_dir}/newsgroups-LIB.csv")

## Pre-Process the Gensim Way

### Stopwords

We create a set of frequent words. Of course, we can grab a premade list from somewhere else, such as NLTK.

In [11]:
stoplist = set('for a of the and to in is i that it you this be on are'.split(' '))

### Corpus

We loop through the list of docs and do some parsing and shaping on the fly. 

Again, we could do better with tools from NLTK.

Here we lowercase each document, split it by white space, remove non-alphanumeric characters, and filter out stopwords

In [12]:
texts = [[re.sub(r"[\W_]+", "", word) for word in document.lower().split() if word not in stoplist]
         for document in LIB.doc_content.values]

### Term Frequencies

We count word frequencies in order to filter out low-frequency words.

In [13]:
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

### Filtered Corpus

We filter by frequency, removing words that appear once.

In [14]:
filtered_corpus = [[token for token in text if frequency[token] > 1] for text in texts]

### Dictionary

We create a "dictionary," aka a vocabulary, which associates a term string with a numeric identifier.

In [15]:
dictionary = corpora.Dictionary(filtered_corpus)

### BOW

We create the BOW corpus from the text using the dictionary.

In [16]:
bow_corpus = [dictionary.doc2bow(text) for text in filtered_corpus]

In [None]:
# bow_corpus[0]

## Train models

### TFIDF

In [17]:
tfidf = models.TfidfModel(bow_corpus)

In [None]:
# tfidf[bow_corpus[5]]

### LDA

In [18]:
model = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=num_topics)

In [19]:
model2 = models.HdpModel(bow_corpus, id2word=dictionary)

## Convert to Pandas

### VOCAB

In [20]:
VOCAB = pd.DataFrame([(k, v) for k, v in dictionary.token2id.items()], columns=['term_str','term_id']) #.set_index('term_id')
VOCAB['n'] = VOCAB.term_str.map(lambda x: frequency[x])
VOCAB = VOCAB.set_index('term_id').sort_index()

In [21]:
VOCAB.sample(5)

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
39942,asci,4
78664,embellished,2
51417,amitriptyline,3
49134,scratchpad,2
55095,canadafrancegermany,2


### TFIDF

In [22]:
tfidf_data = []
for doc_id, doc in enumerate(bow_corpus):
    for term in tfidf[doc]:
        tfidf_data.append((doc_id, term[0], term[1]))
TFIDF = pd.DataFrame(tfidf_data, columns=['doc_id','term_id', 'tfidf']).set_index(['doc_id','term_id'])

In [23]:
TFIDF.tfidf.unstack(fill_value=0)

term_id,0,1,2,3,4,5,6,7,8,9,...,79154,79155,79156,79157,79158,79159,79160,79161,79162,79163
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.121893,0.042943,0.014431,0.066946,0.041293,0.013847,0.013055,0.054541,0.064667,0.011687,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.039125,0.000000,0.000000,0.056313,0.035394,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.012670,0.000000,0.000000,0.000000,0.021386,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.031419,0.014811,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18823,0.000000,0.000000,0.020391,0.000000,0.000000,0.000000,0.018447,0.000000,0.000000,0.049539,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18824,0.000000,0.000000,0.031593,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.012792,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18825,0.000000,0.000000,0.000000,0.000000,0.000000,0.012472,0.000000,0.000000,0.000000,0.031579,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18826,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### BOW

In [24]:
bow_data = []
for i, doc in enumerate(bow_corpus):
    for term in doc:
        bow_data.append((i, term[0], term[1]))
BOW = pd.DataFrame(bow_data, columns=['doc_id','term_id', 'n']).set_index(['doc_id','term_id'])     
DTM = BOW.n.unstack(fill_value=0)

In [25]:
BOW.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n
doc_id,term_id,Unnamed: 2_level_1
0,0,1
0,1,1
0,2,1
0,3,1
0,4,1


In [26]:
DTM.head()

term_id,0,1,2,3,4,5,6,7,8,9,...,79154,79155,79156,79157,79158,79159,79160,79161,79162,79163
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,2,0,0,3,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### LDA

#### PHI

In [27]:
PHI = pd.DataFrame(model.get_topics()).T
PHI.index.name = 'term_id'

In [28]:
PHI

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.000003,1.887274e-07,0.000004,0.000004,3.526690e-08,0.000002,0.000002,0.000005,3.677251e-07,0.000003,...,2.922518e-08,0.000002,4.237303e-07,0.000006,0.000002,0.000004,0.000002,0.000002,0.000005,3.623229e-07
1,0.000058,4.461062e-05,0.000031,0.000014,1.829915e-05,0.000010,0.000579,0.001338,3.122375e-03,0.000068,...,1.444699e-04,0.000463,4.398554e-04,0.000027,0.000045,0.000036,0.000013,0.000068,0.000032,3.867033e-04
2,0.000519,2.493649e-03,0.001028,0.000350,4.622725e-03,0.001245,0.000545,0.000712,1.131792e-03,0.000995,...,6.116238e-03,0.001270,3.416304e-03,0.000480,0.002007,0.002921,0.000937,0.003250,0.000766,1.832998e-03
3,0.000006,3.468364e-07,0.000020,0.000006,1.218944e-04,0.000005,0.000006,0.000005,3.865761e-07,0.000007,...,3.536138e-05,0.000003,2.528780e-06,0.000007,0.000004,0.000005,0.000003,0.000002,0.000006,4.313731e-07
4,0.000119,8.735790e-04,0.000026,0.000021,1.046181e-04,0.000066,0.000015,0.000012,1.335262e-05,0.000086,...,8.001481e-04,0.000211,2.801424e-04,0.000024,0.000275,0.000243,0.000111,0.000203,0.000027,3.249556e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79159,0.000003,1.887274e-07,0.000004,0.000004,3.400639e-08,0.000002,0.000002,0.000005,3.677251e-07,0.000003,...,2.831140e-08,0.000002,4.237303e-07,0.000006,0.000002,0.000004,0.000002,0.000002,0.000005,3.623229e-07
79160,0.000003,1.887274e-07,0.000004,0.000004,3.400639e-08,0.000002,0.000002,0.000005,3.677251e-07,0.000003,...,2.831140e-08,0.000002,4.237303e-07,0.000006,0.000002,0.000004,0.000002,0.000002,0.000005,3.623229e-07
79161,0.000003,1.887274e-07,0.000004,0.000004,3.400639e-08,0.000002,0.000002,0.000005,3.677251e-07,0.000003,...,2.831140e-08,0.000002,4.237303e-07,0.000006,0.000002,0.000004,0.000002,0.000002,0.000005,3.623229e-07
79162,0.000003,1.887274e-07,0.000004,0.000004,3.400639e-08,0.000002,0.000002,0.000005,3.677251e-07,0.000003,...,2.831140e-08,0.000002,4.237303e-07,0.000006,0.000002,0.000004,0.000002,0.000002,0.000005,3.623229e-07


#### THETA

In [29]:
theta_data = []
for doc_id, doc_bow in enumerate(bow_corpus):
    for topic in model.get_document_topics(doc_bow):
        theta_data.append((doc_id, topic[0], topic[1]))
THETA = pd.DataFrame(theta_data, columns=['doc_id', 'topic_id', 'topic_weight']).set_index(['doc_id','topic_id']).unstack(fill_value=0)

In [30]:
THETA

Unnamed: 0_level_0,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight
topic_id,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
doc_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,0.0,0.0,0.0,0.0,0.065951,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.010672,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.083828,0.0,0.0,0.0,0.0,0.0,...,0.119194,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.015553,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.125811,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18823,0.0,0.0,0.0,0.0,0.217310,0.0,0.0,0.0,0.0,0.0,...,0.130745,0.000000,0.033712,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
18824,0.0,0.0,0.0,0.0,0.205028,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
18825,0.0,0.0,0.0,0.0,0.110122,0.0,0.0,0.0,0.0,0.0,...,0.273527,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
18826,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


#### TOPIC

In [31]:
topic_data = []
for t in range(num_topics):
    for term_rank, term in enumerate(model.get_topic_terms(t)):
        term_id = term[0]
        topic_data.append((t, term_rank, dictionary.id2token[term_id]))

In [32]:
TOPIC = pd.DataFrame(topic_data, columns=['topic_id', 'term_rank', 'term_str'])\
    .set_index(['topic_id','term_rank']).term_str.unstack()

In [33]:
TOPIC.head(20)

term_rank,0,1,2,3,4,5,6,7,8,9
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,drives,san,connector,angels,recording,portable,drive,heads,motherboard,bradley
1,have,with,version,drive,my,or,if,hard,disk,but
2,dawn,login,dumb,lim,pounds,1959,justin,clintons,,combine
3,values,quantum,instructions,royal,conner,morals,fills,indirect,gentleman,broadcast
4,god,not,who,he,as,by,his,,all,with
5,sex,marriage,priest,sexual,married,faithful,drugs,wedding,reckon,reflects
6,p,250,350,175,275,150,120,69,52,118
7,hi,los,sunday,angeles,beam,las,temporary,cal,fuse,june
8,card,graphics,with,pc,color,monitor,video,ram,have,controller
9,hanging,batteries,pain,static,purchased,isaac,adapter,eugene,smiths,hint
