# Model Description
This notebook introduces the concept of topic modeling and walks through the code for building a topic model with the Latent Dirichlet Allocation (LDA) method in Python, using the Gensim implementation on a Snapchat review dataset.
 
In LDA, each document is modeled as a mixture of several topics. LDA is similar to probabilistic latent semantic analysis (pLSA), but it additionally assumes that the topic distributions follow a Dirichlet distribution.

For example, let's say that there are documents containing many words such as [Quantum Mechanics, Higgs Particle, Maxwell's Equation, Theory of Relativity, Uncertainty Principle] and documents containing many words such as [Shakespeare, Tolstoy, Faust, 1984]. We can infer that the first group of documents is related to physics and the second group is related to literature. The reason is that we know that the words in a document represent its subject matter. Without this prior information, we could not name the subject ourselves. However, even without prior information, words and subjects are statistically related: words that belong to the same subject tend to occur together in the same documents. In LDA, the subject is therefore not determined by meaning or perception, but by the likelihood of the words, which is estimated from the documents themselves with unsupervised learning (no subject labels are needed).

![](https://byeongkijeong.github.io/img/post_img/2019-06-06-lda/lda.png)

## Importing Packages

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import logging
import gensim
from gensim.models import CoherenceModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
import gensim.corpora as corpora
import pyLDAvis
import pyLDAvis.gensim_models
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# NLTK WordNet lemmatizer instance; no code visible in this notebook section references it.
lemmatizer = WordNetLemmatizer()
# spaCy 'en_core_web_sm' pipeline; `lemmatization` below calls it to POS-tag and lemmatize each document.
nlp = spacy.load('en_core_web_sm')

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

## Preprocessing datasets

In [None]:
def DataFrame(data):
    """Return an independent copy of ``data`` limited to the review columns.

    Only the ``title``, ``review`` and ``date`` columns are kept, in that
    order. The selection is copied, so later in-place edits do not touch the
    caller's frame.
    """
    wanted_columns = ['title', 'review', 'date']
    subset = data[wanted_columns]
    return subset.copy()

def preprocessing(data):
    """Build a cleaned, lower-cased ``contents`` column from title and review.

    The ``title`` and ``review`` columns are cast to ``str`` and joined with a
    newline. Every character that is not an ASCII letter is then replaced by a
    space, words of three characters or fewer are dropped, and the result is
    lower-cased.

    Args:
        data: DataFrame with ``title`` and ``review`` columns. It is modified
            in place (``title`` and ``review`` are re-typed, ``contents`` is
            added).

    Returns:
        The same DataFrame object, for call chaining.
    """
    data['title'] = data['title'].astype('str')
    data['review'] = data['review'].astype('str')

    # title + review, separated by a newline (vectorised; also works on an empty frame)
    data['contents'] = data['title'] + "\n" + data['review']
    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # match, which would leave all punctuation and digits in place.
    data['contents'] = data['contents'].str.replace(r"[^a-zA-Z]", " ", regex=True)
    # Keep only words longer than three characters.
    data['contents'] = data['contents'].apply(
        lambda x: ' '.join(w for w in x.split() if len(w) > 3)
    )
    data['contents'] = data['contents'].str.lower()
    return data

def stopword(data):
    """Tokenise ``contents`` on whitespace and drop stop words.

    The stop-word vocabulary is the union of:

    * NLTK's English stop words,
    * ``non_eng_words``: a hard-coded comma-separated list of non-English
      tokens (they look like Malay/Indonesian chat words -- TODO confirm),
    * ``smrt_com_words``: a hard-coded comma-separated list of common English
      words and month/day abbreviations (presumably the SMART stop list),
    * a handful of URL, news-source and app-specific terms added inline.

    Args:
        data: DataFrame with a string ``contents`` column. It is modified in
            place by adding a ``tokens`` column.

    Returns:
        The same DataFrame, with ``tokens`` holding the list of remaining
        words for each row.
    """
    stop_words = stopwords.words('english')
    non_eng_words = "ˆã?ªã,¢ãƒªã,‰,¹ã?«ã?•ã,ã,abis,ada,adek,ah,aja,ajan,ako,aku,akut,ampang,anak,apa,ape,atau,babit,baek,bagus,bahasekan,bahru,baik,bali,balik,bangun,bantu,banyak,baru,bayar,bb,bbm,beda,beli,berani,berisi,berteman,besok,betul,bgt,bhs,bikin,bila,bintang,bisa,blanje,bleh,blm,boleh,bontot,bos,brg,buat,bukan,cakap,camp,cinta,ckp,coba,cos,cuma,da,dah,dan,dari,dat,degil,deh,den,dengar,dgn,dgr,di,dia,dikit,diluar,doa,dong,duit,dulu,fyp,gag,gak,geisha,gewoon,gila,gimana,gitu,gk,gua,gue,guru,gw,hahaha,hari,haris,harus,hati,hujan,ingat,itu,iya,jadi,jagoin,jajan,jalan,jam,jd,jengok,jodoh,judulnya,juga,kacau,kakgue,kalau,kalo,kamu,kan,kangen,kasi,kasus,kat,kata,kau,kayy,ke,kekurusan,keliru,kerjaan,ketauan,kilat,kita,klo,kmu,knapa,kok,kol,kosong,krn,kuala,kurus,la,lagi,lain,lama,lamaan,lame,lemak,lerr,lge,lho,liat,liatin,libur,lulus,luna,lupa,mah,mahu,makan,mana,mao,masih,mau,mcm,melihat,mencintai,mengerti,menit,mereka,met,mga,milih,msh,mungkin,nabei,nabung,nak,naloka,nama,nana,nangis,nanti,nasi,ne,ngak,ngan,ngantuk,ngerti,ngga,ngn,ni,nih,nk,no,ntar,ntr,ny,nya,nye,orang,pagi,pakai,pake,pakwe,par,pas,pasir,pasti,pe,penting,per,pernah,pertemanan,pgi,pikir,pon,pulak,pulsa,pun,punya,racun,ramas,rase,rindu,rmh,rumah,sabar,sah,sakit,salah,sama,saman,sampe,sanggu,sangka,satu,saya,sebelum,seh,sekarang,selalu,semalam,semlm,semua,sendiri,seorang,sia,sih,skrg,smlm,smpi,suka,suroh,suruh,tak,tali,tangan,tapi,tau,taun,teman,tentu,terperap,terspam,tertipu,tetap,tidur,tlad,tolong,trs,trus,tuh,uang,uda,udah,udh,une,untuk,utk,waktu,ya,yaa,yah,yang,yazid,yg,yosie,yuk."
    non_eng_words = non_eng_words.split(',')

    # Kept on a single line: a plain "..." literal cannot span a line break.
    smrt_com_words = "reuters,ap,jan,feb,mar,apr,may,jun,jul,aug,sep,oct,nov,dec,tech,news,index,mon,tue,wed,thu,fri,sat,'s,a,a's,able,about,above,according,accordingly,across,actually,after,afterwards,again,against,ain't,all,allow,allows,almost,alone,along,already,also,although,always,am,amid,among,amongst,an,and,another,any,anybody,anyhow,anyone,anything,anyway,anyways,anywhere,apart,appear,appreciate,appropriate,are,aren't,around,as,aside,ask,asking,associated,at,available,away,awfully,b,be,became,because,become,becomes,becoming,been,before,beforehand,behind,being,believe,below,beside,besides,best,better,between,beyond,both,brief,but,by,c,c'mon,c's,came,can,can't,cannot,cant,cause,causes,certain,certainly,changes,clearly,co,com,come,comes,concerning,consequently,consider,considering,contain,containing,contains,corresponding,could,couldn't,course,currently,d,definitely,described,despite,did,didn't,different,do,does,doesn't,doing,don't,done,down,downwards,during,e,each,edu,eg,e.g.,eight,either,else,elsewhere,enough,entirely,especially,et,etc,etc.,even,ever,every,everybody,everyone,everything,everywhere,ex,exactly,example,except,f,far,few,fifth,five,followed,following,follows,for,former,formerly,forth,four,from,further,furthermore,g,get,gets,getting,given,gives,go,goes,going,gone,got,gotten,greetings,h,had,hadn't,happens,hardly,has,hasn't,have,haven't,having,he,he's,hello,help,hence,her,here,here's,hereafter,hereby,herein,hereupon,hers,herself,hi,him,himself,his,hither,hopefully,how,howbeit,however,i,i'd,i'll,i'm,i've,ie,i.e.,if,ignored,immediate,in,inasmuch,inc,indeed,indicate,indicated,indicates,inner,insofar,instead,into,inward,is,isn't,it,it'd,it'll,it's,its,itself,j,just,k,keep,keeps,kept,know,knows,known,l,lately,later,latter,latterly,least,less,lest,let,let's,like,liked,likely,little,look,looking,looks,ltd,m,mainly,many,may,maybe,me,mean,meanwhile,merely,might,more,moreover,most,mostly,mr.,ms.,much,must,my,myself,n,namely,nd,near,nearly,necessary,need,needs,neither,never,nevertheless,new,next,nine,no,nobody,non,none,noone,nor,normally,not,nothing,novel,now,nowhere,o,obviously,of,off,often,oh,ok,okay,old,on,once,one,ones,only,onto,or,other,others,otherwise,ought,our,ours,ourselves,out,outside,over,overall,own,p,particular,particularly,per,perhaps,placed,please,plus,possible,presumably,probably,provides,q,que,quite,qv,r,rather,rd,re,really,reasonably,regarding,regardless,regards,relatively,respectively,right,s,said,same,saw,say,saying,says,second,secondly,see,seeing,seem,seemed,seeming,seems,seen,self,selves,sensible,sent,serious,seriously,seven,several,shall,she,should,shouldn't,since,six,so,some,somebody,somehow,someone,something,sometime,sometimes,somewhat,somewhere,soon,sorry,specified,specify,specifying,still,sub,such,sup,sure,t,t's,take,taken,tell,tends,th,than,thank,thanks,thanx,that,that's,thats,the,their,theirs,them,themselves,then,thence,there,there's,thereafter,thereby,therefore,therein,theres,thereupon,these,they,they'd,they'll,they're,they've,think,third,this,thorough,thoroughly,those,though,three,through,throughout,thru,thus,to,together,too,took,toward,towards,tried,tries,truly,try,trying,twice,two,u,un,under,unfortunately,unless,unlikely,until,unto,up,upon,us,use,used,useful,uses,using,usually,uucp,v,value,various,very,via,viz,vs,w,want,wants,was,wasn't,way,we,we'd,we'll,we're,we've,welcome,well,went,were,weren't,what,what's,whatever,when,whence,whenever,where,where's,whereafter,whereas,whereby,wherein,whereupon,wherever,whether,which,while,whither,who,who's,whoever,whole,whom,whose,why,will,willing,wish,with,within,without,won't,wonder,would,would,wouldn't,x,y,yes,yet,you,you'd,you'll,you're,you've,your,yours,yourself,yourselves,z,zero"
    smrt_com_words = smrt_com_words.split(',')

    stop_words.extend(non_eng_words)
    stop_words.extend(smrt_com_words)
    stop_words.extend(['ms','mr','http','www','co','html','goo_gl','blog','rt','https','bit','goo','gl','ly',
                       'com','nytimes','ow','new','york','times', 'news','also','even','still','much','day','could',
                       'nytime','washington','photo','section','\'s','inc','washpost', 'thing','something','percent','und',
                       'literature', 'may', 'paper', 'der','die','eine','von','however','elsevier', 'author','well','rights',
                       'reserve','_reserve','reserved','be','que','fur','das','den','auf','ein','des','would','latime','nyt',
                       'say','org','uk','eu','fb','do','govt','pic_twitter','pic','twitter','site','pm','website','twitt',
                       'net','ca','web','cc','lnkd','linkedin','away','soon','maybe','bn','pdf','et','al','wsj','report',
                       'bloomberg','tinyurl','From',',The','snap','chat','snapchat'
                       ])

    # A set gives O(1) membership tests; the combined list has over a thousand
    # entries and is consulted once per token.
    stop_set = set(stop_words)

    data['tokens'] = data['contents'].apply(lambda x: x.split())
    data['tokens'] = data['tokens'].apply(lambda x: [item for item in x if item not in stop_set])

    return data

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenised documents, keeping only selected parts of speech.

    Each document's tokens are joined with spaces and run through the
    module-level spaCy pipeline ``nlp``. The lemma of every resulting token
    whose coarse POS tag is in ``allowed_postags`` is kept.

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: POS tags (spaCy ``token.pos_`` values) to keep. The
            default is an immutable tuple, so it cannot be altered between calls.

    Returns:
        A list with one list of lemma strings per input document, in input
        order. Empty input yields an empty list.
    """
    keep_pos = frozenset(allowed_postags)
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in keep_pos])
    return texts_out


def preprocessed_data(data):
    """Run the full cleaning pipeline on a raw review DataFrame.

    Steps, in order: column selection (``DataFrame``), text cleaning
    (``preprocessing``), stop-word removal (``stopword``), then lemmatization
    of the ``tokens`` column restricted to nouns, adjectives, verbs and
    adverbs. The lemmas are stored in a new ``lemmatized`` column.

    Returns:
        The processed DataFrame.
    """
    cleaned = stopword(preprocessing(DataFrame(data)))
    cleaned['lemmatized'] = lemmatization(
        cleaned['tokens'],
        allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
    )
    return cleaned

## Find *K* value with coherence score
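A set of statements or facts is said to be coherent if they support each other. Thus, a coherent fact set can be interpreted in a context that covers all or most of the facts. An example of a coherent fact set is “the game is a team sport”, “the game is played with a ball”, and “the game demands great physical effort”. Topic coherence applies the same idea to the top words of each topic: the higher the coherence score, the more the top words of a topic support each other, so the score can be used to compare models trained with different numbers of topics *K*.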

### Coherence score: c_v
![](http://i.stack.imgur.com/VQh9m.png)

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """Train one LDA model per candidate topic count and score its coherence.

    For every ``num_topics`` in ``range(start, limit, step)`` an ``LdaModel``
    is fitted on ``corpus`` and evaluated with the ``c_v`` coherence measure
    against ``texts``.

    Args:
        dictionary: gensim ``Dictionary`` mapping token ids to words. It is
            used both to train the models and to compute coherence.
        corpus: Bag-of-words corpus built with ``dictionary``.
        texts: Tokenised documents, as required by the ``c_v`` measure.
        limit: Exclusive upper bound for the number of topics.
        start: First number of topics to try.
        step: Increment between candidate topic counts.

    Returns:
        ``(model_list, coherence_values)``: two parallel lists with one entry
        per candidate. Both are empty when the range is empty.

    Note:
        No ``random_state`` is passed to ``LdaModel``, so scores can differ
        between runs.
    """
    coherence_values = []
    model_list = []

    for num_topics in range(start, limit, step):
        # Use the `dictionary` argument rather than a same-named global, so
        # the model and the coherence computation share one vocabulary.
        model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

def find_optimal_number_of_topics(dictionary, corpus, processed_data, limit=30, start=2, step=6):
    """Plot the c_v coherence score against the number of topics.

    One LDA model is trained for each topic count in
    ``range(start, limit, step)`` via ``compute_coherence_values``, and the
    resulting scores are drawn as a line chart.

    Args:
        dictionary: gensim ``Dictionary`` for the corpus.
        corpus: Bag-of-words corpus.
        processed_data: Tokenised documents used for the coherence score.
        limit: Exclusive upper bound for the number of topics (default 30).
        start: First number of topics to try (default 2).
        step: Increment between candidates (default 6).

    Returns:
        None. The plot is shown as a side effect.
    """
    model_list, coherence_values = compute_coherence_values(
        dictionary=dictionary,
        corpus=corpus,
        texts=processed_data,
        start=start,
        limit=limit,
        step=step,
    )
    x = range(start, limit, step)

    # Label the line itself: legend(("coherence_values")) passes a bare string,
    # which is read character by character and labels the line "c".
    plt.plot(x, coherence_values, label="coherence_values")
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(loc='best')
    plt.show()

In [None]:
# Load the raw App Store reviews and run the cleaning pipeline.
data = pd.read_csv("/kaggle/input/10k-snapchat-reviews/Snapchat_app_store_reviews.csv")
data = preprocessed_data(data)

token = data['lemmatized']
text = data['contents']
processed_data = token

# `id2word` and `dictionary` are two names for the same Dictionary object, so
# the in-place filtering below is visible through both.
id2word = corpora.Dictionary(processed_data)
dictionary = id2word

# Keep tokens that occur in at least 10 documents and in no more than 5% of them.
dictionary.filter_extremes(no_below=10, no_above=0.05)
# Build the bag-of-words corpus once, after filtering. The earlier pre-filter
# pass was always overwritten by this one.
corpus = [dictionary.doc2bow(doc) for doc in processed_data]

#print('Number of unique tokens: %d' % len(dictionary))
#print('Number of documents: %d' % len(corpus))

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
find_optimal_number_of_topics(dictionary, corpus, processed_data)

## Create LDA Model and Visualizing with PyLDAvis

In [None]:
# Train the final model with the hyper-parameters chosen for this notebook.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=14,
    passes=10,
    chunksize=100,
    alpha=0.01,
    eta=0.9,
    random_state=100,
)

# Register the notebook renderer, then build and show the interactive view.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
pyLDAvis.display(vis)