Topic Modeling

In [69]:
import pandas as pd
import numpy as np
import pickle


In [70]:
#Import Data
# error_bad_lines was removed in pandas 2.0. on_bad_lines='warn' is its replacement
# and matches what error_bad_lines=False did (malformed rows are reported and skipped).
# pandas older than 1.3 rejects the new keyword with a TypeError, so fall back
# to the legacy spelling there.
# NOTE(review): a four-space delimiter is unusual - confirm it is not a tab that
# was converted to spaces somewhere along the way.
try:
    data = pd.read_csv('air-pollution-disease.csv', on_bad_lines='warn', delimiter='    ')
except TypeError:
    data = pd.read_csv('air-pollution-disease.csv', error_bad_lines=False, delimiter='    ')

  


In [71]:
# Keep only the 'Tweet text' column; this is the input to the cleaning functions below.
textData = data['Tweet text']

In [72]:
# Apply a first round of text cleaning techniques
import re
import string
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

def clean_text_round1(text):
    '''Lowercase the text and strip URLs, bracketed text, punctuation, number-bearing words and mojibake.

    Parameters
    ----------
    text : str
        Raw text of a single document.

    Returns
    -------
    str
        The cleaned text. Removed spans are deleted in place, so runs of
        whitespace may remain where they used to be.
    '''
    text = text.lower()
    # URLs have to go before punctuation is stripped: once ':' and '/' are
    # removed the link can no longer be recognised. The pattern is also left
    # unanchored so links in the middle of the text are caught.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Leftover mis-decoded byte sequences seen in the data.
    text = re.sub('â€™', '', text)
    text = re.sub('œ', '', text)
    return text

def clean_text_round2(text):
    '''Second cleaning pass: delete curly quotes, ellipsis characters and newlines from the text.'''
    # Apply each removal pattern in turn; every match is replaced by nothing.
    for pattern in ('[‘’“”…]', '\n'):
        text = re.sub(pattern, '', text)
    return text

def lemmatize_stemming(text):
    '''Lemmatize ``text`` as a verb with WordNetLemmatizer, then stem the lemma with the English SnowballStemmer.'''
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return SnowballStemmer("english").stem(lemma)

def preprocess(text):
    '''Tokenize ``text`` with gensim's simple_preprocess and return the kept tokens as one string.

    Tokens that are gensim stop words or have 3 characters or fewer are dropped.
    Each remaining token is passed through lemmatize_stemming, and the results
    are joined with single spaces.
    '''
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    kept = [
        lemmatize_stemming(tok)
        for tok in gensim.utils.simple_preprocess(text)
        if tok not in stop_words and len(tok) > 3
    ]
    return " ".join(map(str, kept))

In [73]:
# Run every tweet through both cleaning rounds and then the stop-word / stemming step.
# The source column is read straight from `data` and the result is built under a
# separate name: the original rebound `textData` from Series to list to DataFrame,
# so running this cell a second time failed.
cleaned_tweets = [
    preprocess(clean_text_round2(clean_text_round1(tweet)))
    for tweet in data['Tweet text']
]
textData = pd.DataFrame(cleaned_tweets)
# Persist the cleaned text so it can be reloaded without repeating the cleaning.
textData.to_pickle("clean_data.pkl")

In [74]:
# Preview column 0 of the cleaned frame, which holds the preprocessed tweet text.
textData[0]

0             familiar negat effect pollut know damag eye
1       rebuild neighbourhood layout act solar calenda...
2       filter protect brain pollut learn filter abil ...
3                    shortterm effect pollut bloodpressur
4       background cool paper europ general germani pa...
                              ...                        
2022    octob issu look pollut effect brain think orig...
2023    intern preval chemic sensit copreval asthma au...
2024    cite report show mortal effect youll know poll...
2025    diolch fawr iawn mcgarri ddod trafod llygredd ...
2026     boost immun fight effect pollut onion help write
Name: 0, Length: 2027, dtype: object

In [75]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(textData[0])
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement when
# it exists and fall back to the old accessor on older releases.
if hasattr(cv, "get_feature_names_out"):
    cv_terms = cv.get_feature_names_out()
else:
    cv_terms = cv.get_feature_names()
# One row per document, one column per vocabulary term.
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv_terms)
data_dtm

Unnamed: 0,aaar,aaron,aasima,abbott,abdab,abeyta,abido,abil,abl,ablat,...,î²caroten,ðÿ²ðÿ,ðÿš²bus,ðÿšðÿ,ðÿžmental,ðÿžðÿ,ùƒøªø,ùƒùø,ûøªù,ƒðÿš
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2023,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2025,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [76]:
import pickle

# Persist the fitted vectorizer and the document-term matrix to disk.
# The context manager closes the file handle that a bare open() would leave open.
with open("cv_stop.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)
data_dtm.to_pickle("data_dtm.pkl")

### Topic Modeling 1

In [77]:
from gensim import matutils, models
import scipy.sparse

In [78]:
# gensim wants terms as rows and documents as columns, so flip the document-term matrix
tdm = data_dtm.T
tdm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2017,2018,2019,2020,2021,2022,2023,2024,2025,2026
aaar,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaron,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aasima,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abbott,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abdab,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [79]:
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
# Step 1: wrap the term-document DataFrame in a SciPy CSR sparse matrix.
sparse_counts = scipy.sparse.csr_matrix(tdm)
# Step 2: hand the sparse matrix to gensim's Sparse2Corpus adapter.
corpus = matutils.Sparse2Corpus(sparse_counts)

In [80]:
# Gensim also requires a dictionary of all terms and their respective location in the term-document matrix.
# Reload the fitted vectorizer inside a context manager so the file handle is closed.
# NOTE: pickle.load can execute arbitrary code; only load pickle files this notebook wrote itself.
with open("cv_stop.pkl", "rb") as cv_file:
    cv = pickle.load(cv_file)
# vocabulary_ maps term -> column position; invert it to position -> term.
id2word = {position: term for term, position in cv.vocabulary_.items()}

In [82]:
# Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term),
# we need to specify two other parameters as well - the number of topics and the number of passes.
# random_state pins the otherwise random initialisation so the topics are the same on every run.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10, random_state=42)
lda.print_topics()

[(0,
  '0.053*"effect" + 0.044*"pollut" + 0.017*"airpollut" + 0.013*"health" + 0.012*"caus" + 0.010*"studi" + 0.009*"know" + 0.008*"level" + 0.008*"harm" + 0.008*"death"'),
 (1,
  '0.066*"effect" + 0.060*"pollut" + 0.033*"health" + 0.016*"airpollut" + 0.015*"peopl" + 0.013*"harm" + 0.012*"function" + 0.010*"public" + 0.010*"brain" + 0.009*"cognit"')]

In [83]:
# Refit the 2-topic model on the same corpus and id2word with passes raised from 10 to 100.
# random_state pins the otherwise random initialisation so the topics are the same on every run.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=100, random_state=42)
lda.print_topics()

[(0,
  '0.059*"pollut" + 0.058*"effect" + 0.017*"peopl" + 0.017*"airpollut" + 0.015*"function" + 0.013*"harm" + 0.012*"cognit" + 0.011*"studi" + 0.010*"damag" + 0.010*"long"'),
 (1,
  '0.061*"effect" + 0.050*"pollut" + 0.035*"health" + 0.016*"airpollut" + 0.009*"public" + 0.009*"caus" + 0.009*"harm" + 0.008*"need" + 0.007*"issu" + 0.007*"climat"')]

### Topic Modeling 2

In [84]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Tokenize ``text`` and return its noun tokens joined by single spaces.

    A token counts as a noun when its part-of-speech tag begins with 'NN'.
    '''
    tagged = pos_tag(word_tokenize(text))
    return ' '.join(token for token, tag in tagged if tag.startswith('NN'))

In [89]:
import nltk
# Download the 'punkt' tokenizer package into the local nltk_data directory.
# Presumably this is what word_tokenize relies on - confirm against the NLTK docs.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rishiraj\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [91]:
# Download the 'averaged_perceptron_tagger' package into the local nltk_data directory.
# Presumably this is the model behind pos_tag - confirm against the NLTK docs.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Rishiraj\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [85]:
# Read in the cleaned data, before the CountVectorizer step
# 'clean_data.pkl' is the file this notebook wrote earlier with textData.to_pickle.
# Pickle files can execute code when loaded, so only read ones produced locally.
data_clean = pd.read_pickle('clean_data.pkl')
data_clean

Unnamed: 0,0
0,familiar negat effect pollut know damag eye
1,rebuild neighbourhood layout act solar calenda...
2,filter protect brain pollut learn filter abil ...
3,shortterm effect pollut bloodpressur
4,background cool paper europ general germani pa...
...,...
2022,octob issu look pollut effect brain think orig...
2023,intern preval chemic sensit copreval asthma au...
2024,cite report show mortal effect youll know poll...
2025,diolch fawr iawn mcgarri ddod trafod llygredd ...


In [92]:
# Apply the nouns function to every cleaned tweet so only nouns remain
data_nouns = data_clean[0].apply(nouns).to_frame()
data_nouns

Unnamed: 0,0
0,effect pollut damag eye
1,neighbourhood act calendar effect equinox larg...
2,filter protect brain pollut filter abil preser...
3,effect pollut bloodpressur
4,background paper europ germani surpris pollut ...
...,...
2022,look effect brain parkinson diseas
2023,sensit copreval autism effect fragranc consum ...
2024,report show effect youll pollut death death ce...
2025,diolch fawr iawn mcgarri ddod trafod llygredd ...


In [93]:
# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer


cvn = CountVectorizer()
data_cvn = cvn.fit_transform(data_nouns[0])
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement when
# it exists and fall back to the old accessor on older releases.
if hasattr(cvn, "get_feature_names_out"):
    cvn_terms = cvn.get_feature_names_out()
else:
    cvn_terms = cvn.get_feature_names()
# One row per document (same index as data_nouns), one column per noun term.
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn_terms, index=data_nouns.index)
data_dtmn

Unnamed: 0,aaar,aaron,aasima,abdab,abido,abil,abl,abnorm,abort,abound,...,ªuae,ê³µì,ðÿ²ðÿ,ðÿš²bus,ðÿšðÿ,ðÿžmental,ðÿžðÿ,ùƒùø,ûøªù,ƒðÿš
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2023,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2025,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [94]:
# Build the gensim corpus from the transposed (terms x documents) noun matrix
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.T))

# Build the vocabulary dictionary by inverting term -> position into position -> term
id2wordn = {position: term for term, position in cvn.vocabulary_.items()}

In [95]:
# Let's start with 2 topics on the nouns-only corpus.
# random_state pins the otherwise random initialisation so the topics are the same on every run.
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10, random_state=42)
ldan.print_topics()

[(0,
  '0.093*"effect" + 0.079*"pollut" + 0.060*"health" + 0.017*"function" + 0.016*"peopl" + 0.015*"airpollut" + 0.014*"brain" + 0.013*"cognit" + 0.011*"harm" + 0.011*"children"'),
 (1,
  '0.074*"effect" + 0.042*"pollut" + 0.021*"harm" + 0.015*"airpollut" + 0.012*"heart" + 0.011*"death" + 0.010*"year" + 0.010*"work" + 0.010*"cancer" + 0.010*"action"')]

In [97]:
# Let's try 4 topics on the nouns-only corpus.
# random_state pins the otherwise random initialisation so the topics are the same on every run.
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10, random_state=42)
ldan.print_topics()

[(0,
  '0.067*"effect" + 0.052*"pollut" + 0.032*"health" + 0.028*"work" + 0.023*"mcgarri" + 0.019*"studi" + 0.016*"today" + 0.016*"help" + 0.015*"fuel" + 0.014*"thank"'),
 (1,
  '0.068*"effect" + 0.049*"health" + 0.034*"pollut" + 0.026*"climat" + 0.017*"autism" + 0.017*"chang" + 0.013*"includ" + 0.011*"danger" + 0.011*"action" + 0.011*"heat"'),
 (2,
  '0.107*"effect" + 0.091*"pollut" + 0.032*"health" + 0.032*"function" + 0.029*"harm" + 0.027*"brain" + 0.025*"cognit" + 0.023*"peopl" + 0.023*"airpollut" + 0.020*"children"'),
 (3,
  '0.083*"effect" + 0.061*"pollut" + 0.029*"health" + 0.023*"airpollut" + 0.019*"harm" + 0.017*"condit" + 0.015*"heart" + 0.015*"death" + 0.014*"level" + 0.014*"year"')]

### Topic Modeling 3

In [98]:
# Let's create a function to pull out nouns and adjectives from a string of text
def nouns_adj(text):
    '''Tokenize ``text`` and return its noun and adjective tokens joined by single spaces.

    A token is kept when its part-of-speech tag begins with 'NN' or 'JJ'.
    '''
    tagged = pos_tag(word_tokenize(text))
    kept = [token for token, tag in tagged if tag[:2] in ('NN', 'JJ')]
    return ' '.join(kept)

In [100]:
# Apply the nouns_adj function to every cleaned tweet so only nouns and adjectives remain
data_nouns_adj = data_clean[0].apply(nouns_adj).to_frame()
data_nouns_adj

Unnamed: 0,0
0,familiar negat effect pollut damag eye
1,neighbourhood act solar calendar light effect ...
2,filter protect brain pollut filter abil preser...
3,shortterm effect pollut bloodpressur
4,background cool paper europ general germani pa...
...,...
2022,octob issu look pollut effect brain parkinson ...
2023,intern preval chemic sensit copreval asthma au...
2024,cite report show mortal effect youll pollut ca...
2025,diolch fawr iawn mcgarri ddod trafod llygredd ...


In [102]:
# Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df
cvna = CountVectorizer(max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj[0])
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement when
# it exists and fall back to the old accessor on older releases.
if hasattr(cvna, "get_feature_names_out"):
    cvna_terms = cvna.get_feature_names_out()
else:
    cvna_terms = cvna.get_feature_names()
# One row per document, one column per noun/adjective term.
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna_terms)
data_dtmna

Unnamed: 0,aaar,aaron,aasima,abdab,abeyta,abido,abil,abl,ablat,abnorm,...,î²caroten,ðÿ²ðÿ,ðÿš²bus,ðÿšðÿ,ðÿžmental,ðÿžðÿ,ùƒøªø,ùƒùø,ûøªù,ƒðÿš
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2023,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2025,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [103]:
# Build the gensim corpus from the transposed (terms x documents) noun/adjective matrix
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.T))

# Build the vocabulary dictionary by inverting term -> position into position -> term
id2wordna = {position: term for term, position in cvna.vocabulary_.items()}

In [104]:
# Let's start with 2 topics on the nouns-and-adjectives corpus.
# random_state pins the otherwise random initialisation so the topics are the same on every run.
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10, random_state=42)
ldana.print_topics()

[(0,
  '0.034*"pollut" + 0.016*"airpollut" + 0.013*"harm" + 0.011*"studi" + 0.011*"heart" + 0.011*"death" + 0.010*"risk" + 0.009*"work" + 0.009*"cancer" + 0.009*"lung"'),
 (1,
  '0.077*"pollut" + 0.046*"health" + 0.015*"airpollut" + 0.015*"peopl" + 0.013*"harm" + 0.011*"function" + 0.011*"cognit" + 0.010*"caus" + 0.010*"brain" + 0.009*"damag"')]

In [105]:
# Let's try 5 topics on the nouns-and-adjectives corpus.
# random_state pins the otherwise random initialisation so the topics are the same on every run.
ldana = models.LdaModel(corpus=corpusna, num_topics=5, id2word=id2wordna, passes=10, random_state=42)
ldana.print_topics()

[(0,
  '0.055*"pollut" + 0.049*"health" + 0.024*"public" + 0.022*"climat" + 0.015*"peopl" + 0.015*"issu" + 0.013*"term" + 0.012*"long" + 0.012*"medic" + 0.011*"coal"'),
 (1,
  '0.081*"pollut" + 0.038*"function" + 0.038*"cognit" + 0.032*"peopl" + 0.030*"damag" + 0.029*"studi" + 0.028*"harm" + 0.024*"year" + 0.020*"work" + 0.019*"brain"'),
 (2,
  '0.072*"pollut" + 0.034*"health" + 0.018*"children" + 0.017*"reduc" + 0.017*"exposur" + 0.017*"condit" + 0.017*"airpollut" + 0.015*"advers" + 0.013*"harm" + 0.011*"awar"'),
 (3,
  '0.049*"pollut" + 0.028*"health" + 0.024*"harm" + 0.023*"caus" + 0.023*"airpollut" + 0.018*"level" + 0.017*"death" + 0.013*"mortal" + 0.013*"exercis" + 0.010*"heart"'),
 (4,
  '0.042*"pollut" + 0.028*"autism" + 0.019*"asthma" + 0.019*"airqual" + 0.018*"airpollut" + 0.017*"brain" + 0.015*"lack" + 0.015*"consum" + 0.015*"function" + 0.014*"product"')]

In [106]:
# Keep 5 topics on the nouns-and-adjectives corpus but raise passes from 10 to 100.
# random_state pins the otherwise random initialisation so the topics are the same on every run.
ldana = models.LdaModel(corpus=corpusna, num_topics=5, id2word=id2wordna, passes=100, random_state=42)
ldana.print_topics()

[(0,
  '0.041*"health" + 0.040*"pollut" + 0.022*"issu" + 0.020*"airpollut" + 0.018*"climat" + 0.014*"chemic" + 0.013*"peopl" + 0.013*"autism" + 0.013*"diseas" + 0.012*"impact"'),
 (1,
  '0.056*"pollut" + 0.026*"health" + 0.022*"reduc" + 0.018*"exposur" + 0.014*"airpollut" + 0.010*"outdoor" + 0.009*"person" + 0.009*"condit" + 0.009*"intervent" + 0.008*"cochran"'),
 (2,
  '0.068*"pollut" + 0.026*"airpollut" + 0.024*"health" + 0.020*"lung" + 0.020*"risk" + 0.019*"public" + 0.019*"function" + 0.018*"brain" + 0.018*"caus" + 0.016*"coal"'),
 (3,
  '0.047*"pollut" + 0.022*"peopl" + 0.020*"health" + 0.020*"cognit" + 0.019*"function" + 0.019*"harm" + 0.015*"children" + 0.014*"public" + 0.014*"studi" + 0.014*"work"'),
 (4,
  '0.090*"pollut" + 0.034*"health" + 0.026*"caus" + 0.016*"level" + 0.015*"harm" + 0.015*"term" + 0.012*"exercis" + 0.011*"airpollut" + 0.010*"advers" + 0.009*"evid"')]