In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from string import punctuation
import gensim
import pyLDAvis.gensim
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel,CoherenceModel
from sklearn.decomposition import NMF,LatentDirichletAllocation



In [2]:
# Load the NPR articles CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path — the notebook
# cannot be re-run on another machine without editing it.
df = pd.read_csv('C:/Users/Samarth/Desktop/Mtech AI/NLP/Course/NLP_COURSE_FILES/05-Topic-Modeling/npr.csv')

In [3]:
# Peek at the first rows to confirm the file loaded and to see the 'Article' column.
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [4]:
# Tokens to discard during cleaning: NLTK's English stop words (de-duplicated),
# every character in string.punctuation, and three typographic marks.
stop_words = [*set(stopwords.words('english')), *punctuation, '’', "”", '—']
# Single WordNet lemmatizer instance reused for every token.
lem = WordNetLemmatizer()

In [5]:
def cleaning(text):
    """Normalise one article into a single space-separated string of lemmas.

    Steps: lower-case the text, tokenize it with ``word_tokenize``, keep tokens
    longer than three characters, drop anything listed in ``stop_words``, then
    lemmatize each remaining token as a verb (``pos='v'``).

    Parameters
    ----------
    text : str
        Raw article text. Non-string values (for example ``None`` or the
        ``NaN`` pandas uses for a missing cell) are treated as an empty
        document instead of raising ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing
        survives the filters or the input is not a string.
    """
    if not isinstance(text, str):
        # Missing cell: nothing to clean.
        return ''
    # Set lookup instead of scanning the module-level list once per token.
    blocked = set(stop_words)
    tokens = word_tokenize(text.lower())
    kept = [tok for tok in tokens if len(tok) > 3 and tok not in blocked]
    return ' '.join(lem.lemmatize(tok, 'v') for tok in kept)

In [6]:
# Store the cleaned, space-joined version of every article in a new 'New' column.
df['New'] = df['Article'].map(cleaning)

In [7]:
def cleaning_gensim(text):
    """Normalise one article into a list of lemmas (token-list form).

    Applies the same pipeline as the string-returning cleaner: lower-case,
    ``word_tokenize``, keep tokens longer than three characters, drop anything
    listed in ``stop_words``, lemmatize each remaining token as a verb
    (``pos='v'``). The tokens are returned as a list rather than joined.

    Parameters
    ----------
    text : str
        Raw article text. Non-string values (for example ``None`` or the
        ``NaN`` pandas uses for a missing cell) are treated as an empty
        document instead of raising ``AttributeError`` on ``.lower()``.

    Returns
    -------
    list of str
        The cleaned tokens in their original order; ``[]`` when nothing
        survives the filters or the input is not a string.
    """
    if not isinstance(text, str):
        # Missing cell: an empty token list.
        return []
    # Set lookup instead of scanning the module-level list once per token.
    blocked = set(stop_words)
    tokens = word_tokenize(text.lower())
    kept = [tok for tok in tokens if len(tok) > 3 and tok not in blocked]
    return [lem.lemmatize(tok, 'v') for tok in kept]

In [8]:
# Raw article texts as a plain Python list, in DataFrame row order.
text = df['Article'].tolist()

In [9]:
# One token list per article.
# NOTE(review): this tokenizes and lemmatizes the whole corpus a second time;
# the 'New' column already holds the same tokens joined by spaces.
clean_doc = list(map(cleaning_gensim, text))

In [10]:
# Cleaned article strings (the 'New' column) used as input to the TF-IDF vectorizer.
x = df['New']

In [11]:
# TF-IDF vectorizer for the cleaned article strings.
tfidf = TfidfVectorizer(
    max_df=0.9,            # drop terms present in more than 90% of documents
    min_df=2,              # drop terms present in fewer than 2 documents
    stop_words='english',  # scikit-learn's built-in English stop-word list
)

In [12]:
# Fit the TF-IDF vocabulary on the cleaned articles and build the document-term matrix.
x1 = tfidf.fit_transform(x)

In [13]:
# Sanity check: (number of documents, vocabulary size).
x1.shape

(11992, 43357)

In [14]:
# NMF topic model with 7 components; random_state is fixed so reruns give the same topics.
nmf = NMF(n_components=7,random_state=42)

In [15]:
# Fit the NMF factorisation on the TF-IDF matrix.
nmf.fit(x1)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=7, random_state=42, shuffle=False, solver='cd', tol=0.0001,
  verbose=0)

In [16]:
# Spot-check one vocabulary entry (column index 5) to see what the features look like.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() — confirm against the installed version.
tfidf.get_feature_names()[5]

'01'

In [17]:
# Print the ten highest-weighted terms for each NMF topic.
# The vocabulary is fetched once up front; previously the full feature list was
# rebuilt for every single printed word.
# get_feature_names_out() is used when the installed scikit-learn provides it,
# otherwise the older get_feature_names() is called.
_feature_name_getter = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
feature_names = _feature_name_getter()
for topic_id, weights in enumerate(nmf.components_):
    print(topic_id)
    # argsort() is ascending, so the strongest term of the topic is printed last.
    print(' '.join(feature_names[k] for k in weights.argsort()[-10:]))

0
want time really make music know people think like say
1
russia republican obama white house donald say campaign president trump
2
drug patients people plan coverage medicaid say insurance care health
3
poll delegate democratic hillary state campaign voters vote sanders clinton
4
people force city kill isis officer attack report say police
5
children college parent kid say teachers student education students school
6
house state vote rule judge president justice supreme senate court


In [18]:
# Build the gensim token <-> id mapping from the tokenized articles.
dictionary = Dictionary(clean_doc)

In [19]:
# Bag-of-words representation of every tokenized article, in the same order as clean_doc.
corpus = list(map(dictionary.doc2bow, clean_doc))

In [20]:
# Train a 7-topic gensim LDA model on the bag-of-words corpus.
ldamodel = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=7,
    random_state=42,  # fixed seed so the topics are repeatable
    passes=50,        # 50 passes over the corpus
)

In [21]:
# Show each LDA topic as a weighted list of its top words.
print(ldamodel.print_topics())

[(0, '0.028*"trump" + 0.013*"say" + 0.011*"clinton" + 0.010*"state" + 0.009*"president" + 0.008*"vote" + 0.008*"campaign" + 0.007*"would" + 0.006*"obama" + 0.005*"republican"'), (1, '0.019*"say" + 0.009*"court" + 0.008*"company" + 0.006*"report" + 0.005*"trump" + 0.005*"president" + 0.005*"case" + 0.005*"would" + 0.005*"federal" + 0.005*"department"'), (2, '0.020*"say" + 0.011*"people" + 0.011*"like" + 0.010*"think" + 0.009*"know" + 0.008*"go" + 0.007*"make" + 0.007*"want" + 0.007*"time" + 0.006*"school"'), (3, '0.026*"say" + 0.008*"food" + 0.007*"water" + 0.006*"people" + 0.005*"make" + 0.005*"like" + 0.004*"years" + 0.003*"city" + 0.003*"come" + 0.003*"go"'), (4, '0.021*"say" + 0.012*"health" + 0.009*"people" + 0.007*"study" + 0.007*"percent" + 0.006*"care" + 0.005*"would" + 0.005*"find" + 0.005*"state" + 0.005*"also"'), (5, '0.007*"like" + 0.006*"make" + 0.005*"music" + 0.005*"time" + 0.004*"first" + 0.004*"write" + 0.004*"book" + 0.003*"also" + 0.003*"world" + 0.003*"show"'), (6, '

In [31]:
# Topic mixture the model assigns to the first article, as (topic id, probability) pairs.
ldamodel[corpus[0]]

[(0, 0.47595543), (1, 0.38612434), (6, 0.1370497)]

In [36]:
# Topics associated with the word 'clinton', as (topic id, probability) pairs.
ldamodel.get_term_topics('clinton')

[(0, 0.011518571)]

In [40]:
# Log-perplexity figure reported by the model for the corpus it was trained on.
print(ldamodel.log_perplexity(corpus))

-8.319401912610767


In [39]:
# Topic-coherence evaluator for the trained LDA model; the coherence measure is
# left at gensim's default.
coherence = CoherenceModel(
    model=ldamodel,
    texts=clean_doc,
    dictionary=dictionary,
)

In [41]:
# Compute and print the coherence score for the LDA topics.
print(coherence.get_coherence())

0.37205780214972534
