In [11]:
import os
import glob
import sys
import argparse
import json
import errno
import gensim
from nltk import *
from gensim import corpora, models
# files = glob.glob(p + "/data/1.txt")

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string


import re
# Working directory of the kernel; main() looks for the data folder beneath it.
p = os.getcwd()

# Text-processing helpers shared by cleaning_text() and cleaning().
tokenizer = RegexpTokenizer(r'\w+')  # splits on runs of word characters
lemmatize = WordNetLemmatizer()

# NOTE: this rebinds the imported `stopwords` corpus reader to a plain set of
# English stopwords; after this line `stopwords.words(...)` is no longer
# available under this name.
stopwords = set(stopwords.words('english'))
punctuation = set(string.punctuation)

def cleaning_text(text):
    """Tokenize ``text`` into lowercase alphabetic words without stopwords.

    Args:
        text (str): raw text to clean.

    Returns:
        list of str: lowercase tokens, in their original order, that are not
        members of the module-level ``stopwords`` set.
    """
    # Every character that is not an ASCII letter becomes a space, so digits,
    # punctuation and other symbols act as word boundaries.
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = tokenizer.tokenize(letters_only.lower())
    # The module-level stopword set is reused; it does not need to be rebuilt
    # on every call.
    return [w for w in words if w not in stopwords]

def cleaning(article):
    """Normalize ``article`` into a single string of lemmatized content words.

    Args:
        article (str): raw text to normalize.

    Returns:
        str: lowercase, space-separated words with non-letter characters and
        stopwords removed, each passed through the module-level ``lemmatize``
        object.
    """
    # Keep ASCII letters only; everything else turns into a space.
    alpha_only = re.sub("[^a-zA-Z]", " ", article)

    # Drop stopwords after lowercasing.
    kept = []
    for token in alpha_only.lower().split():
        if token not in stopwords:
            kept.append(token)
    without_stops = " ".join(kept)

    # Character-level punctuation filter (the substitution above has already
    # replaced non-letters, so this is a second line of defence).
    stripped = "".join(ch for ch in without_stops if ch not in punctuation)

    return " ".join(lemmatize.lemmatize(word) for word in stripped.split())

# Accumulates every "[OCR_aligned]" text read by loadtext().
cleaned_text = []


def loadtext(testfiles):
    """Collect the ``[OCR_aligned]`` text from each file in ``testfiles``.

    Every line that starts with the ``[OCR_aligned]`` tag contributes the
    remainder of that line (including its trailing newline, if any). Each
    extracted text is appended to the module-level ``cleaned_text`` list and
    also returned.

    Args:
        testfiles (iterable of str): paths of the files to read as UTF-8.

    Returns:
        list of str: the texts extracted by this call, in reading order.

    Raises:
        IOError: for any I/O failure other than a path being a directory.
    """
    marker = "[OCR_aligned]"
    loaded = []
    for name in testfiles:
        try:
            with open(name, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.startswith(marker):
                        # The +1 also drops the single character right after
                        # the tag (presumably a separator space -- confirm
                        # against the data files).
                        raw_corpus = line[len(marker) + 1:]
                        loaded.append(raw_corpus)
                        cleaned_text.append(raw_corpus)
        except IOError as exc:
            # A path that turns out to be a directory is skipped silently;
            # every other I/O error is propagated to the caller.
            if exc.errno != errno.EISDIR:
                raise
    return loaded

In [21]:
def main():
    """Load the OCR corpus into the module-level ``cleaned_text`` list.

    When ``p`` is a directory, every ``*.txt`` file under
    ``<p>/data/en_periodicals`` is read; otherwise ``p`` itself is treated as
    the single input file. The actual reading is delegated to ``loadtext``.
    """
    if os.path.isdir(p):
        # Sorted so that document order does not depend on the order in which
        # the filesystem happens to list the files.
        testfiles = sorted(glob.glob(p + "/data/en_periodicals/*.txt"))
    else:
        testfiles = [p]

    # Empty the corpus in place first: re-running this cell would otherwise
    # append every document a second time.
    del cleaned_text[:]
    loadtext(testfiles)


main()

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    Args:
        model: fitted object exposing ``components_``, an iterable with one
            weight vector per topic (each must support ``argsort()``).
        feature_names: sequence mapping a column index to its term.
        no_top_words (int): number of terms to print per topic.

    Prints, for each topic, a ``Topic <index>:`` header followed by one line
    of space-separated terms in descending weight order.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending: flip it, then keep the leading entries.
        descending = weights.argsort()[::-1]
        top_terms = [feature_names[idx] for idx in descending[:no_top_words]]
        print("Topic %d:" % (topic_idx))
        print(" ".join(top_terms))

# dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))

def _vocabulary_terms(vectorizer):
    """Return the feature names of a fitted vectorizer as a plain list.

    Newer scikit-learn releases expose ``get_feature_names_out``; older ones
    only provide ``get_feature_names``. Whichever exists is used, so the cell
    runs on both.
    """
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


# The corpus collected by loadtext(); one string per document.
documents = cleaned_text

no_features = 10000  # upper bound on vocabulary size for both vectorizers
no_topics = 20       # number of topics/components fitted by each model

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = _vocabulary_terms(tfidf_vectorizer)

# LDA can only use raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = _vocabulary_terms(tf_vectorizer)

In [23]:
# Run NMF
# NOTE(review): `alpha` is the legacy regularization keyword; newer
# scikit-learn releases replace it with `alpha_W` / `alpha_H`, whose scaling
# differs -- confirm the intended strength before porting this call.
nmf = NMF(n_components=no_topics, random_state=4, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
# The topic-count keyword was renamed from `n_topics` to `n_components`.
# Try the current name first and fall back to the old one, so the cell runs
# on either side of the rename.
_lda_options = dict(max_iter=5, learning_method='online', learning_offset=50., random_state=0)
try:
    _lda_estimator = LatentDirichletAllocation(n_components=no_topics, **_lda_options)
except TypeError:
    _lda_estimator = LatentDirichletAllocation(n_topics=no_topics, **_lda_options)
lda = _lda_estimator.fit(tf)

no_top_words = 25  # terms printed per topic
print("\nTopics in LDA model:\n")
display_topics(lda, tf_feature_names, no_top_words)

print("\nTopics in NMF model:\n")
display_topics(nmf, tfidf_feature_names, no_top_words)




Topics in LDA model:

Topic 0:
mr esq new tbe said great london tho house street place time 000 men having year day man general little saturday th public john present
Topic 1:
thé man said little good corsica like time old great day did long way love life lady thou mr shall oliver thy know say road
Topic 2:
mr tbe street new great london day time esq house said general man 000 sir court having men th place present state aud tho country
Topic 3:
said hee man thou rush good did fryer great bacon doe went came dl quoth time come haue thee shall thy king wife goe againe
Topic 4:
mr tbe street new day time said house london th great tho tlie having place man 000 public year mrs st john present years aud
Topic 5:
mr tbe having new great time good tlie day man street said present th ot year place public aud house saturday tho 000 took following
Topic 6:
ceylindo fo mr cure aud moft starch years luxe london medicine day powell tiff tbe remedy mazawattee balsam free iu pills chemist st ot drop