In [1]:
import pandas as pd
import os, json
from unidecode import unidecode

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
from nltk.stem import *
from nltk.stem import PorterStemmer

In [2]:
def getBeautifyText(text):
     return unidecode(text.replace('\n', ' '))

In [3]:
# Get only question
# Get all json file from data directory
path_to_json = 'data/'
columns = ['question']
questionAndAnswerDf = pd.DataFrame(columns=columns)
indexCount = 0
for pos_json in os.listdir(path_to_json):
    if pos_json.endswith('.json'):
        with open(path_to_json + pos_json, encoding="utf8") as json_file:
            data = json.load(json_file)
            for index, questionObject in enumerate(data):
                questionAndAnswerDf.loc[indexCount] = [getBeautifyText(questionObject["question"])]
                indexCount += 1

In [4]:
questionAndAnswerDf.head()

Unnamed: 0,question
0,How do you compare Methane Ppm and carbon(iv) ...
1,Equatorial Bulge and its implications on the c...
2,Where oil is used for [closed]
3,Rain Water vs Sprinkler Irrigation
4,Climate Change in the North


In [5]:
print(len(questionAndAnswerDf))

3233


In [6]:
# lemmatize text 
def lemmatize_stemming(text):
    stemmer=PorterStemmer()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

In [7]:
# text processing 
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [8]:
# apply texgt pre processing (1st question)
questionSample = questionAndAnswerDf.iloc[0]['question']

print('Original question: ')
words = []
for word in questionSample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(questionSample))

Original question: 
['How', 'do', 'you', 'compare', 'Methane', 'Ppm', 'and', 'carbon(iv)', 'oxide', '[closed]']


 tokenized and lemmatized document: 
['compar', 'methan', 'carbon', 'oxid', 'close']


In [9]:
# create one process dataset 
processDf = questionAndAnswerDf['question'].map(preprocess)
processDf.head()

0    [compar, methan, carbon, oxid, close]
1         [equatori, bulg, implic, climat]
2                                  [close]
3          [rain, water, sprinkler, irrig]
4                   [climat, chang, north]
Name: question, dtype: object

In [10]:
# apply beg of words
dictionary = gensim.corpora.Dictionary(processDf)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 carbon
1 close
2 compar
3 methan
4 oxid
5 bulg
6 climat
7 equatori
8 implic
9 irrig
10 rain


In [11]:
# filter out tokens that appear in 
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [13]:
# how many words and how many times those words appear
bow_corpus = [dictionary.doc2bow(doc) for doc in processDf]

In [15]:
bow_corpus[100]

[(73, 1), (82, 1), (89, 1)]

In [16]:
# Preview Bag Of Words for our sample preprocessed document
bow_doc_100 = bow_corpus[100]
for i in range(len(bow_doc_100)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_100[i][0], 
                                               dictionary[bow_doc_100[i][0]], 
bow_doc_100[i][1]))

Word 73 ("area") appears 1 time.
Word 82 ("hotter") appears 1 time.
Word 89 ("winter") appears 1 time.


In [17]:
# Create tf-idf model object using models
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.4923813985714881), (1, 0.5224031771053126), (2, 0.6961720181758678)]


In [18]:
# Running LDA using Bag of Words
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [19]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.100*"climat" + 0.083*"chang" + 0.079*"ocean" + 0.060*"heat" + 0.035*"go" + 0.034*"planet" + 0.029*"pollut" + 0.023*"life" + 0.023*"stop" + 0.023*"fight"
Topic: 1 
Words: 0.073*"earth" + 0.051*"climat" + 0.038*"chang" + 0.037*"emiss" + 0.037*"problem" + 0.035*"human" + 0.031*"ipcc" + 0.029*"arctic" + 0.027*"experi" + 0.026*"carbon"
Topic: 2 
Words: 0.218*"climat" + 0.135*"chang" + 0.039*"model" + 0.036*"data" + 0.035*"temperatur" + 0.031*"increas" + 0.028*"rise" + 0.027*"level" + 0.023*"atmospher" + 0.022*"global"
Topic: 3 
Words: 0.119*"climat" + 0.094*"temperatur" + 0.048*"year" + 0.043*"averag" + 0.039*"earth" + 0.034*"chang" + 0.023*"global" + 0.022*"feedback" + 0.018*"model" + 0.017*"question"
Topic: 4 
Words: 0.172*"climat" + 0.125*"chang" + 0.067*"world" + 0.032*"scientist" + 0.030*"water" + 0.026*"differ" + 0.024*"impact" + 0.021*"school" + 0.019*"look" + 0.019*"strike"
Topic: 5 
Words: 0.109*"greenhous" + 0.102*"effect" + 0.044*"close" + 0.040*"time" + 0.039*

In [20]:
# Running LDA using TF-IDF
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.097*"global" + 0.089*"warm" + 0.053*"rise" + 0.047*"caus" + 0.045*"climat" + 0.043*"level" + 0.039*"scientist" + 0.034*"chang" + 0.032*"trump" + 0.032*"human"
Topic: 1 Word: 0.131*"climat" + 0.061*"earth" + 0.044*"greenhous" + 0.039*"affect" + 0.037*"scienc" + 0.034*"atmospher" + 0.028*"chang" + 0.026*"cloud" + 0.025*"believ" + 0.024*"heat"
Topic: 2 Word: 0.121*"ocean" + 0.085*"increas" + 0.046*"happen" + 0.038*"close" + 0.032*"earth" + 0.031*"time" + 0.031*"green" + 0.030*"stop" + 0.029*"temperatur" + 0.027*"gener"
Topic: 3 Word: 0.099*"temperatur" + 0.076*"warm" + 0.067*"global" + 0.038*"record" + 0.029*"data" + 0.029*"problem" + 0.028*"land" + 0.026*"emiss" + 0.025*"peopl" + 0.025*"earth"
Topic: 4 Word: 0.060*"heat" + 0.047*"model" + 0.040*"question" + 0.034*"melt" + 0.032*"climat" + 0.028*"ipcc" + 0.026*"life" + 0.026*"china" + 0.025*"chang" + 0.024*"current"
Topic: 5 Word: 0.196*"chang" + 0.158*"climat" + 0.036*"energi" + 0.028*"denier" + 0.027*"come" + 0.025*"nee

In [21]:
# Performance evaluation by classifying sample document using LDA Bag of Words model
processDf[100]

['vietnam',
 'highland',
 'colder',
 'winter',
 'hotter',
 'summer',
 'lower',
 'lie',
 'area']

In [22]:
for index, score in sorted(lda_model[bow_corpus[100]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.7749512195587158	 
Topic: 0.119*"climat" + 0.094*"temperatur" + 0.048*"year" + 0.043*"averag" + 0.039*"earth" + 0.034*"chang" + 0.023*"global" + 0.022*"feedback" + 0.018*"model" + 0.017*"question"

Score: 0.02501291036605835	 
Topic: 0.070*"climat" + 0.064*"caus" + 0.063*"temperatur" + 0.060*"global" + 0.060*"chang" + 0.049*"like" + 0.025*"weather" + 0.023*"winter" + 0.021*"countri" + 0.021*"atmospher"

Score: 0.025010153651237488	 
Topic: 0.242*"warm" + 0.226*"global" + 0.027*"scienc" + 0.018*"cool" + 0.017*"carbon" + 0.016*"level" + 0.016*"happen" + 0.015*"atmospher" + 0.014*"earth" + 0.014*"water"

Score: 0.025008363649249077	 
Topic: 0.225*"climat" + 0.180*"chang" + 0.056*"earth" + 0.048*"affect" + 0.035*"natur" + 0.030*"energi" + 0.018*"theori" + 0.018*"atmospher" + 0.017*"peopl" + 0.016*"trump"

Score: 0.025007443502545357	 
Topic: 0.109*"greenhous" + 0.102*"effect" + 0.044*"close" + 0.040*"time" + 0.039*"carbon" + 0.029*"fuel" + 0.028*"plant" + 0.027*"ga" + 0.027*"atmo

In [23]:
# Performance evaluation by classifying sample document using LDA TF-IDF model.
for index, score in sorted(lda_model_tfidf[bow_corpus[100]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.5552629232406616	 
Topic: 0.131*"climat" + 0.061*"earth" + 0.044*"greenhous" + 0.039*"affect" + 0.037*"scienc" + 0.034*"atmospher" + 0.028*"chang" + 0.026*"cloud" + 0.025*"believ" + 0.024*"heat"

Score: 0.2446936070919037	 
Topic: 0.120*"climat" + 0.108*"chang" + 0.045*"weather" + 0.045*"impact" + 0.039*"model" + 0.032*"increas" + 0.031*"radiat" + 0.028*"flood" + 0.026*"caus" + 0.023*"natur"

Score: 0.02502145804464817	 
Topic: 0.099*"temperatur" + 0.076*"warm" + 0.067*"global" + 0.038*"record" + 0.029*"data" + 0.029*"problem" + 0.028*"land" + 0.026*"emiss" + 0.025*"peopl" + 0.025*"earth"

Score: 0.025007816031575203	 
Topic: 0.069*"differ" + 0.060*"year" + 0.054*"planet" + 0.042*"protest" + 0.040*"futur" + 0.037*"cycl" + 0.034*"better" + 0.029*"save" + 0.027*"citi" + 0.026*"arctic"

Score: 0.025004912167787552	 
Topic: 0.097*"global" + 0.089*"warm" + 0.053*"rise" + 0.047*"caus" + 0.045*"climat" + 0.043*"level" + 0.039*"scientist" + 0.034*"chang" + 0.032*"trump" + 0.032*"huma

In [24]:
# Testing model on unseen document
unseen_document = 'How a climate will be changed next few years'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.48230278491973877	 Topic: 0.225*"climat" + 0.180*"chang" + 0.056*"earth" + 0.048*"affect" + 0.035*"natur"
Score: 0.31766024231910706	 Topic: 0.119*"climat" + 0.094*"temperatur" + 0.048*"year" + 0.043*"averag" + 0.039*"earth"
Score: 0.025010138750076294	 Topic: 0.101*"climat" + 0.090*"chang" + 0.038*"year" + 0.034*"melt" + 0.028*"measur"
Score: 0.02500765025615692	 Topic: 0.218*"climat" + 0.135*"chang" + 0.039*"model" + 0.036*"data" + 0.035*"temperatur"
Score: 0.02500654011964798	 Topic: 0.100*"climat" + 0.083*"chang" + 0.079*"ocean" + 0.060*"heat" + 0.035*"go"
Score: 0.025006357580423355	 Topic: 0.172*"climat" + 0.125*"chang" + 0.067*"world" + 0.032*"scientist" + 0.030*"water"
Score: 0.02500297501683235	 Topic: 0.070*"climat" + 0.064*"caus" + 0.063*"temperatur" + 0.060*"global" + 0.060*"chang"
Score: 0.025001974776387215	 Topic: 0.073*"earth" + 0.051*"climat" + 0.038*"chang" + 0.037*"emiss" + 0.037*"problem"
Score: 0.025000790134072304	 Topic: 0.242*"warm" + 0.226*"global" + 0