In [2]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from gensim import corpora, models
import nltk
import feedparser




In [3]:
class IdentifyingTopicExample:
    """Fetch a few RSS summaries, tokenize them, and print LDA topics.

    The stages hand data to each other through instance attributes:
    ``getDocuments`` sets ``self.documents`` (raw summary strings),
    ``cleanDocuments`` sets ``self.cleaned`` (one token list per document),
    and ``doLDA`` reads ``self.cleaned``.
    """

    def getDocuments(self, url='https://sports.yahoo.com/mlb/rss.xml', max_entries=5):
        """Parse the feed at ``url`` and keep summaries from its leading entries.

        Args:
            url: Feed address passed to ``feedparser.parse``.
            max_entries: How many entries from the start of the feed to inspect.

        Side effects:
            Sets ``self.documents`` to the kept summaries, prints each kept
            summary, then prints a completion line.
        """
        feed = feedparser.parse(url)
        self.documents = []
        for entry in feed['entries'][:max_entries]:
            # Indexing entry['summary'] aborted the whole run with KeyError
            # when one entry had no summary; such an entry has nothing to
            # model, so it is skipped instead.
            text = entry.get('summary')
            if text is None:
                continue
            # Summaries containing the substring 'ex' are dropped.
            # NOTE(review): the reason for this filter is not visible in this
            # file - confirm it is still wanted.
            if 'ex' in text:
                continue
            self.documents.append(text)
            print("-- {}".format(text))
        print("INFO: Fetching documents from {} completed".format(url))

    def cleanDocuments(self):
        """Lower-case, tokenize, and stop-word-filter ``self.documents``.

        Each document is lower-cased, split into the matches of
        ``[a-zA-Z]+`` (so digits and punctuation never become tokens), and
        stripped of words listed in ``stopwords.words('english')``.

        Side effects:
            Sets ``self.cleaned`` to one token list per document (same order
            as ``self.documents``) and prints a completion line.
        """
        tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        # A set makes the per-token membership test cheap.
        en_stop = set(stopwords.words('english'))
        self.cleaned = [
            [word for word in tokenizer.tokenize(doc.lower()) if word not in en_stop]
            for doc in self.documents
        ]
        print("INFO: Cleaning {} documents completed".format(len(self.documents)))

    def doLDA(self, num_topics=2, num_words=4, random_state=0):
        """Fit an LDA model on ``self.cleaned`` and print its topics.

        Args:
            num_topics: Number of topics to fit and to print.
            num_words: Number of words printed per topic.
            random_state: Seed forwarded to ``LdaModel``. It is fixed by
                default so that re-running on the same documents is meant to
                print the same topics; pass ``None`` for an unseeded model.

        Raises:
            ValueError: If no document contributed any token, for example
                because the feed returned no usable entries.
        """
        # Fail early with an actionable message instead of letting the model
        # constructor fail on an empty vocabulary.
        if not any(self.cleaned):
            raise ValueError(
                "cannot run LDA: no tokens left after cleaning "
                "(was the feed empty or unreachable?)"
            )
        dictionary = corpora.Dictionary(self.cleaned)
        corpus = [dictionary.doc2bow(tokens) for tokens in self.cleaned]
        ldamodel = models.ldamodel.LdaModel(
            corpus,
            num_topics=num_topics,
            id2word=dictionary,
            random_state=random_state,
        )
        print(ldamodel.print_topics(num_topics=num_topics, num_words=num_words))

    def run(self):
        """Run the three stages in order: fetch, clean, model."""
        self.getDocuments()
        self.cleanDocuments()
        self.doLDA()




In [4]:
# Entry point: build the example object and run its fetch -> clean -> LDA
# pipeline. The instance stays bound to the module-level name `topicExample`
# after the run.
if __name__ == '__main__':
    topicExample = IdentifyingTopicExample()
    topicExample.run()


-- Peter Moylan suffered his first loss in 208 appearances, spanning eight seasons, on Friday night — the second-longest streak in MLB history.
-- June 24(Reuters) - Reuters U.S. sports schedule at 2:15 PM ET on Sunday:
-- Dane Dunning left a start with Double-A Birmingham on Saturday due to elbow soreness and the White Sox gave a preliminary update on his status.
-- Freddie Freeman puts the Braves on the board in the 1st, as he lines a single into left field to drive in Ender Inciarte and Ozzie Albies
-- Miguel Andujar belts a three-run homer to left field for his 11th of the year, giving the Yankees a 3-2 lead in the 2nd inning
INFO: Fetching documents from https://sports.yahoo.com/mlb/rss.xml completed
INFO: Cleaning 5 documents completed
[(0, '0.024*"left" + 0.021*"field" + 0.021*"yankees" + 0.021*"run"'), (1, '0.033*"left" + 0.024*"reuters" + 0.023*"field" + 0.020*"st"')]


In [5]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from gensim import corpora, models
import nltk
import feedparser

class IdentifyingTopicExample:
    """Fetch a few RSS summaries, tokenize them, and print LDA topics.

    The stages hand data to each other through instance attributes:
    ``getDocuments`` sets ``self.documents`` (raw summary strings),
    ``cleanDocuments`` sets ``self.cleaned`` (one token list per document),
    and ``doLDA`` reads ``self.cleaned``.
    """

    def getDocuments(self, url='https://sports.yahoo.com/mlb/rss.xml', max_entries=5):
        """Parse the feed at ``url`` and keep summaries from its leading entries.

        Args:
            url: Feed address passed to ``feedparser.parse``.
            max_entries: How many entries from the start of the feed to inspect.

        Side effects:
            Sets ``self.documents`` to the kept summaries, prints each kept
            summary, then prints a completion line.
        """
        feed = feedparser.parse(url)
        self.documents = []
        for entry in feed['entries'][:max_entries]:
            # Indexing entry['summary'] aborted the whole run with KeyError
            # when one entry had no summary; such an entry has nothing to
            # model, so it is skipped instead.
            text = entry.get('summary')
            if text is None:
                continue
            # Summaries containing the substring 'ex' are dropped.
            # NOTE(review): the reason for this filter is not visible in this
            # file - confirm it is still wanted.
            if 'ex' in text:
                continue
            self.documents.append(text)
            print("-- {}".format(text))
        print("INFO: Fetching documents from {} completed".format(url))

    def cleanDocuments(self):
        """Lower-case, tokenize, and stop-word-filter ``self.documents``.

        Each document is lower-cased, split into the matches of
        ``[a-zA-Z]+`` (so digits and punctuation never become tokens), and
        stripped of words listed in ``stopwords.words('english')``.

        Side effects:
            Sets ``self.cleaned`` to one token list per document (same order
            as ``self.documents``) and prints a completion line.
        """
        tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        # A set makes the per-token membership test cheap.
        en_stop = set(stopwords.words('english'))
        self.cleaned = [
            [word for word in tokenizer.tokenize(doc.lower()) if word not in en_stop]
            for doc in self.documents
        ]
        # The message previously read "Clearning"; corrected to "Cleaning".
        print("INFO: Cleaning {} documents completed".format(len(self.documents)))

    def doLDA(self, num_topics=2, num_words=4, random_state=0):
        """Fit an LDA model on ``self.cleaned`` and print its topics.

        Args:
            num_topics: Number of topics to fit and to print.
            num_words: Number of words printed per topic.
            random_state: Seed forwarded to ``LdaModel``. It is fixed by
                default so that re-running on the same documents is meant to
                print the same topics; pass ``None`` for an unseeded model.

        Raises:
            ValueError: If no document contributed any token, for example
                because the feed returned no usable entries.
        """
        # Fail early with an actionable message instead of letting the model
        # constructor fail on an empty vocabulary.
        if not any(self.cleaned):
            raise ValueError(
                "cannot run LDA: no tokens left after cleaning "
                "(was the feed empty or unreachable?)"
            )
        dictionary = corpora.Dictionary(self.cleaned)
        corpus = [dictionary.doc2bow(tokens) for tokens in self.cleaned]
        ldamodel = models.ldamodel.LdaModel(
            corpus,
            num_topics=num_topics,
            id2word=dictionary,
            random_state=random_state,
        )
        print(ldamodel.print_topics(num_topics=num_topics, num_words=num_words))

    def run(self):
        """Run the three stages in order: fetch, clean, model."""
        self.getDocuments()
        self.cleanDocuments()
        self.doLDA()

# Entry point: build the example object and run its fetch -> clean -> LDA
# pipeline. The instance stays bound to the module-level name `topicExample`
# after the run.
if __name__ == '__main__':
    topicExample = IdentifyingTopicExample()
    topicExample.run()


-- Peter Moylan suffered his first loss in 208 appearances, spanning eight seasons, on Friday night — the second-longest streak in MLB history.
-- June 24(Reuters) - Reuters U.S. sports schedule at 2:15 PM ET on Sunday:
-- Dane Dunning left a start with Double-A Birmingham on Saturday due to elbow soreness and the White Sox gave a preliminary update on his status.
-- Freddie Freeman puts the Braves on the board in the 1st, as he lines a single into left field to drive in Ender Inciarte and Ozzie Albies
-- Miguel Andujar belts a three-run homer to left field for his 11th of the year, giving the Yankees a 3-2 lead in the 2nd inning
INFO: Fetching documents from https://sports.yahoo.com/mlb/rss.xml completed
INFO: Clearning 5 documents completed
[(0, '0.034*"left" + 0.032*"field" + 0.021*"single" + 0.021*"albies"'), (1, '0.034*"reuters" + 0.024*"left" + 0.021*"sports" + 0.021*"u"')]
