In [206]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from pprint import pprint
import nltk
import re
import csv
import pandas as pd
import numpy as np
from langdetect import detect
# Seed NumPy's global random number generator so repeated fresh-kernel runs
# start from the same state.
# NOTE(review): presumably this is what makes the LDA runs below repeatable;
# confirm, since none of the model constructors is given an explicit seed.
np.random.seed(2018)

In [207]:
# Fetch the WordNet corpus used by WordNetLemmatizer in lemmatize() below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/sagban/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [208]:
# Paths of the twelve news CSV files that the notebook models, in order.
files = [
    "./CSV/news1.csv",
    "./CSV/news2.csv",
    "./CSV/news3.csv",
    "./CSV/news4.csv",
    "./CSV/news5.csv",
    "./CSV/news6.csv",
    "./CSV/news7.csv",
    "./CSV/news8.csv",
    "./CSV/news9.csv",
    "./CSV/news10.csv",
    "./CSV/news11.csv",
    "./CSV/news12.csv",
]
# Individual names are kept so any cell that refers to a single file still works.
(file1, file2, file3, file4, file5, file6,
 file7, file8, file9, file10, file11, file12) = files

In [209]:
def is_hindi(text):
    """Return True when langdetect classifies ``text`` as Hindi ("hi").

    Args:
        text: The string to classify.

    Returns:
        bool: True only if ``detect(text)`` returns ``"hi"``. Non-string
        input, blank text and text langdetect cannot classify all yield
        False instead of raising.
    """
    # Blank or non-string input carries no language signal; answer without
    # calling the detector, which rejects such input.
    if not isinstance(text, str) or not text.strip():
        return False

    # Local import: the file-level import only brings `detect` into scope.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text) == "hi"
    except LangDetectException:
        # Raised when the text has no usable features (e.g. only digits or
        # punctuation); treat it as "not Hindi".
        return False

In [210]:
def lemmatize(text):
    """Return the WordNet lemma of ``text``, looked up as a verb (pos='v')."""
    lemmatizer = WordNetLemmatizer()
    lemma = lemmatizer.lemmatize(text, pos='v')
    return lemma

In [211]:
def rem_user(t):
    """Remove @-mentions from ``t``.

    A mention is an ``@`` followed by one or more non-whitespace characters.
    The surrounding whitespace is left in place.

    Args:
        t: The input string.

    Returns:
        str: ``t`` with every mention replaced by the empty string.
    """
    # Raw string avoids the invalid "\s" escape warning. One pass is enough:
    # each match runs up to the next whitespace, so removal cannot create a new one.
    return re.sub(r'@[^\s]+', '', t)

In [212]:
def spellcheck(text):
    """Return ``text`` with every character outside lowercase a-z removed.

    Despite the name, no spelling correction happens here: uppercase letters,
    digits, whitespace and punctuation are simply dropped.
    """
    not_lowercase_letter = re.compile(r'[^a-z]')
    return not_lowercase_letter.sub('', text)

In [213]:
def remove_url(text):
    """Return ``text`` with every run starting at "http" up to the next whitespace removed."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)

In [214]:
def preprocess(text):
    """Turn one raw document into a list of lemmatized tokens.

    Steps: strip URLs and @-mentions, skip the document entirely when it is
    detected as Hindi, tokenize with gensim's ``simple_preprocess`` (with
    de-accenting enabled), drop gensim stopwords, lemmatize each token and
    keep only lemmas longer than three characters.

    Args:
        text: The raw document. Non-string values (for example a missing
            cell that arrives as NaN) are treated as empty documents.

    Returns:
        list[str]: The kept lemmas in document order; empty for non-string,
        blank, or Hindi input.
    """
    # The regex helpers below only accept strings; a non-string cell has
    # nothing to tokenize.
    if not isinstance(text, str):
        return []

    text = rem_user(remove_url(text))

    # Cleaning can leave nothing but whitespace, which the language detector
    # cannot classify; there are no tokens to return in that case anyway.
    if not text.strip():
        return []

    if is_hindi(text):
        return []

    result = []
    for token in gensim.utils.simple_preprocess(text, deacc=True):
        if token in gensim.parsing.preprocessing.STOPWORDS:
            continue
        lem = lemmatize(token)
        # Very short lemmas are dropped as uninformative for the topic model.
        if len(lem) > 3:
            result.append(lem)
    return result

In [199]:
# Return preprocessed doc
def preprocess_doc(file):
    """Load a news CSV and tokenize its ``text`` column.

    Args:
        file: Path of a UTF-8 CSV file with at least ``text`` and ``date``
            columns.

    Returns:
        pandas.DataFrame: Columns ``text`` (the result of ``preprocess`` on
        each row), ``index`` (a copy of the row index) and ``date``.
        Malformed CSV lines are skipped with a warning.
    """
    try:
        # `on_bad_lines` replaces `error_bad_lines`, which newer pandas no
        # longer accepts; 'warn' keeps the skip-and-warn behaviour.
        data = pd.read_csv(file, on_bad_lines='warn', encoding='utf-8')
    except TypeError:
        # Older pandas does not know `on_bad_lines`; use the legacy flag.
        data = pd.read_csv(file, error_bad_lines=False, encoding='utf-8')

    processed_docs = data[['text']].copy()
    processed_docs["index"] = processed_docs.index
    processed_docs["date"] = data['date']
    processed_docs['text'] = processed_docs['text'].map(preprocess)
    return processed_docs

In [215]:
def make_dictionary(processed_docs, no_below=15, no_above=0.5, keep_n=100000):
    """Build a gensim ``Dictionary`` from tokenized documents and prune it.

    Args:
        processed_docs: Object whose ``'text'`` entry is an iterable of token
            lists (the frame returned by ``preprocess_doc``).
        no_below: Passed to ``filter_extremes``. Default 15.
        no_above: Passed to ``filter_extremes``. Default 0.5.
        keep_n: Passed to ``filter_extremes``. Default 100000.

    Returns:
        gensim.corpora.Dictionary: The dictionary after ``filter_extremes``.
        The defaults reproduce the thresholds that were previously
        hard-coded, so existing one-argument calls behave as before.
    """
    dictionary = gensim.corpora.Dictionary(processed_docs['text'])
    # Thresholds are parameters so that small corpora, where the default
    # no_below could prune most of the vocabulary, can relax them.
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    return dictionary

In [216]:
# print(bow_corpus[410])
#     bow_doc_43 = bow_corpus[23]
#     for i in range(len(bow_doc_43)):
#         print("Word {} (\"{}\") appears {} time.".format(bow_doc_43[i][0],dictionary[bow_doc_43[i][0]], bow_doc_43[i][1]))

In [217]:
#     Saving the output in the csv file
def save_output(lda_model, output_file):
    """Write the topics of ``lda_model`` to ``output_file`` as CSV.

    The file gets a ``Topic,Values`` header followed by one row per topic:
    the 1-based topic number, then that topic's words in the order returned
    by ``lda_model.print_topics(-1)``. Weights are discarded.

    Args:
        lda_model: Object exposing ``print_topics(-1)`` that yields
            ``(topic_index, 'w1*"word1" + w2*"word2" + ...')`` pairs.
        output_file: Destination path; an existing file is overwritten.
    """
    rows = []
    for idx, topic in lda_model.print_topics(-1):
        # Each term looks like 0.036*"state": keep the part after the "*"
        # and strip the quotes and padding spaces.
        words = [
            term.split("*", 1)[1].replace("\"", "").replace(" ", "")
            for term in topic.split("+")
        ]
        rows.append([str(idx + 1)] + words)

    # newline='' hands line-ending control to the csv module, which avoids
    # blank rows between records on Windows. The encoding matches the UTF-8
    # used when the input CSVs are read. The `with` block closes the file.
    with open(output_file, 'w', newline='', encoding='utf-8') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(['Topic', 'Values'])
        writer.writerows(rows)

In [218]:
def bow_model(file, i):
    """Fit a 10-topic LDA model on bag-of-words counts for one news file.

    The file is preprocessed, a pruned dictionary is built, each document is
    converted with ``doc2bow``, and an ``LdaMulticore`` model is trained
    (2 passes, 2 workers). Every topic is printed, then written through
    ``save_output`` to ``results_bow_news<i>.csv``.

    Args:
        file: Path of the CSV file to model.
        i: Number used in the output file name.
    """
    docs = preprocess_doc(file)
    vocabulary = make_dictionary(docs)
    corpus = list(map(vocabulary.doc2bow, docs['text']))

    lda = gensim.models.LdaMulticore(
        corpus,
        num_topics=10,
        id2word=vocabulary,
        passes=2,
        workers=2,
    )

    for topic_id, terms in lda.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(topic_id, terms))

    save_output(lda, "results_bow_news" + str(i) + ".csv")

    

In [219]:
def tfidf_model(file, i):
    """Fit a 10-topic LDA model on TF-IDF weighted counts for one news file.

    The file is preprocessed, a pruned dictionary is built, each document is
    converted with ``doc2bow``, the corpus is re-weighted by a ``TfidfModel``
    fitted on it, and an ``LdaMulticore`` model is trained (2 passes,
    4 workers). Every topic is printed, then written through ``save_output``
    to ``results_tfidf_news<i>.csv``.

    Args:
        file: Path of the CSV file to model.
        i: Number used in the output file name.
    """
    docs = preprocess_doc(file)
    vocabulary = make_dictionary(docs)
    counts = list(map(vocabulary.doc2bow, docs['text']))

    # Indexing the fitted model with the corpus gives the weighted corpus.
    weighted = models.TfidfModel(counts)[counts]

    lda = gensim.models.LdaMulticore(
        weighted,
        num_topics=10,
        id2word=vocabulary,
        passes=2,
        workers=4,
    )

    for topic_id, terms in lda.print_topics(-1):
        print('\nTopic: {}\t \nWord: {}'.format(topic_id, terms))

    save_output(lda, "results_tfidf_news" + str(i) + ".csv")

In [220]:
# Fit both topic models on every news file; files are numbered from 1 in the
# printed headings and in the result file names.
for i, file in enumerate(files):
    file_number = i + 1
    print("\n\nPrinting topics for file " + str(file_number))
    print("\nBow Model:")
    bow_model(file, file_number)
    print("\nTfidf Model:")
    tfidf_model(file, file_number)



Printing topics for file 1

Bow Model:
Topic: 0 
Words: 0.036*"state" + 0.017*"karnataka" + 0.014*"private" + 0.013*"crore" + 0.012*"insurance" + 0.012*"healthcare" + 0.011*"year" + 0.010*"arogya" + 0.009*"fund" + 0.008*"sector"
Topic: 1 
Words: 0.020*"india" + 0.020*"demonetisation" + 0.020*"income" + 0.015*"wrong" + 0.015*"years" + 0.011*"need" + 0.010*"poor" + 0.010*"support" + 0.010*"magic" + 0.010*"neglect"
Topic: 2 
Words: 0.021*"healthcare" + 0.016*"goyal" + 0.015*"care" + 0.013*"provide" + 0.012*"lakh" + 0.012*"india" + 0.010*"come" + 0.009*"patients" + 0.009*"country" + 0.009*"programme"
Topic: 3 
Words: 0.025*"healthcare" + 0.014*"people" + 0.011*"state" + 0.011*"cover" + 0.010*"year" + 0.008*"family" + 0.008*"come" + 0.008*"pmjay" + 0.007*"hwcs" + 0.007*"india"
Topic: 4 
Words: 0.049*"state" + 0.012*"centre" + 0.012*"insurance" + 0.010*"central" + 0.009*"claim" + 0.009*"national" + 0.008*"fund" + 0.008*"pmjay" + 0.008*"crore" + 0.008*"cent"
Topic: 5 
Words: 0.024*"people" 


Topic: 0	 
Word: 0.011*"hwcs" + 0.011*"pmjay" + 0.009*"integration" + 0.007*"state" + 0.006*"facilities" + 0.006*"beneficiaries" + 0.005*"healthcare" + 0.005*"movement" + 0.005*"effective" + 0.005*"come"

Topic: 1	 
Word: 0.017*"nathealth" + 0.014*"collaboration" + 0.010*"quality" + 0.009*"exercise" + 0.009*"input" + 0.008*"package" + 0.006*"private" + 0.006*"meet" + 0.006*"protocols" + 0.006*"meaningful"

Topic: 2	 
Word: 0.011*"cookie" + 0.011*"privacy" + 0.010*"goyal" + 0.009*"update" + 0.009*"cookies" + 0.008*"income" + 0.008*"institute" + 0.007*"website" + 0.007*"demonetisation" + 0.007*"sanction"

Topic: 3	 
Word: 0.015*"care" + 0.012*"distress" + 0.012*"provide" + 0.012*"fund" + 0.011*"billion" + 0.009*"crore" + 0.008*"people" + 0.008*"finance" + 0.008*"free" + 0.007*"dollar"

Topic: 4	 
Word: 0.011*"screen" + 0.011*"reach" + 0.010*"areas" + 0.010*"uber" + 0.010*"ministry" + 0.009*"identify" + 0.009*"report" + 0.008*"pradesh" + 0.008*"hwcs" + 0.008*"million"

Topic: 5	 
Word: 0

Topic: 0 
Words: 0.068*"state" + 0.025*"insurance" + 0.023*"time" + 0.020*"policy" + 0.019*"cookie" + 0.017*"claim" + 0.017*"pmjay" + 0.017*"merge" + 0.015*"arogya" + 0.014*"centre"
Topic: 1 
Words: 0.023*"country" + 0.018*"state" + 0.017*"people" + 0.017*"private" + 0.017*"medical" + 0.016*"provide" + 0.015*"treatment" + 0.014*"communicable" + 0.013*"launch" + 0.012*"hwcs"
Topic: 2 
Words: 0.024*"hwcs" + 0.024*"pmjay" + 0.014*"percent" + 0.013*"centre" + 0.013*"treatment" + 0.013*"people" + 0.013*"ministry" + 0.012*"crore" + 0.012*"screen" + 0.012*"national"
Topic: 3 
Words: 0.050*"state" + 0.029*"crore" + 0.026*"karnataka" + 0.017*"year" + 0.016*"fund" + 0.015*"private" + 0.015*"provide" + 0.014*"healthcare" + 0.014*"central" + 0.012*"public"
Topic: 4 
Words: 0.044*"healthcare" + 0.021*"quality" + 0.016*"private" + 0.016*"india" + 0.016*"provide" + 0.014*"lakh" + 0.013*"come" + 0.013*"package" + 0.012*"care" + 0.011*"cover"
Topic: 5 
Words: 0.041*"state" + 0.029*"centre" + 0.027*"let


Topic: 0	 
Word: 0.021*"centre" + 0.019*"wellness" + 0.014*"diseases" + 0.013*"care" + 0.012*"primary" + 0.012*"service" + 0.010*"hwcs" + 0.010*"communicable" + 0.010*"areas" + 0.009*"urban"

Topic: 1	 
Word: 0.018*"crore" + 0.017*"offer" + 0.016*"provide" + 0.015*"distress" + 0.014*"fund" + 0.012*"present" + 0.012*"billion" + 0.012*"healthcare" + 0.010*"families" + 0.010*"identify"

Topic: 2	 
Word: 0.019*"minister" + 0.017*"provide" + 0.017*"facilities" + 0.016*"care" + 0.015*"people" + 0.014*"better" + 0.013*"centre" + 0.012*"fund" + 0.012*"friday" + 0.011*"state"

Topic: 3	 
Word: 0.019*"healthcare" + 0.015*"industry" + 0.013*"sector" + 0.012*"india" + 0.011*"achieve" + 0.011*"better" + 0.011*"time" + 0.010*"change" + 0.010*"policy" + 0.010*"bhushan"

Topic: 4	 
Word: 0.034*"karnataka" + 0.018*"state" + 0.015*"crore" + 0.013*"private" + 0.012*"public" + 0.009*"year" + 0.009*"allocation" + 0.009*"central" + 0.009*"fund" + 0.008*"expenditure"

Topic: 5	 
Word: 0.040*"state" + 0.018*

Topic: 0 
Words: 0.015*"healthcare" + 0.014*"nathealth" + 0.013*"quality" + 0.012*"private" + 0.011*"provide" + 0.011*"collaboration" + 0.011*"state" + 0.010*"package" + 0.009*"treatment" + 0.008*"hospitals"
Topic: 1 
Words: 0.026*"pmjay" + 0.021*"hwcs" + 0.012*"healthcare" + 0.011*"integration" + 0.010*"field" + 0.010*"patients" + 0.009*"beneficiaries" + 0.009*"crore" + 0.009*"create" + 0.008*"women"
Topic: 2 
Words: 0.016*"centre" + 0.013*"service" + 0.012*"wellness" + 0.010*"electric" + 0.009*"hwcs" + 0.009*"include" + 0.008*"software" + 0.008*"partner" + 0.008*"company" + 0.007*"policy"
Topic: 3 
Words: 0.032*"centre" + 0.028*"state" + 0.016*"crore" + 0.015*"service" + 0.010*"wellness" + 0.010*"hwcs" + 0.009*"primary" + 0.009*"hospitals" + 0.009*"beneficiary" + 0.008*"provide"
Topic: 4 
Words: 0.017*"healthcare" + 0.013*"sector" + 0.011*"primary" + 0.008*"increase" + 0.008*"amartya" + 0.007*"modi" + 0.007*"need" + 0.007*"centre" + 0.006*"opportunities" + 0.006*"ministry"
Topic: 5 
