In [2]:
import pandas as pd
import numpy as np
import math
from numpy import linalg as LA
import string
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import wordnet
from sklearn.decomposition import NMF,LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [3]:
# Collect the punctuation-free, lower-cased contents of files 16..20 for one author.
text = []
authors = 10  # used below as the numeric suffix of the author_<n> directory
# Translation table that deletes every ASCII punctuation character; built once.
punctuation_remover = str.maketrans('', '', string.punctuation)
for j in range(16, 21):
    filename = "articles/articles/test/author_" + str(authors) + "/" + str(j) + ".txt"
    with open(filename, 'r', encoding="utf8") as file:
        sentence = file.read().translate(punctuation_remover)
    text.append(sentence.lower())

In [4]:
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Convert an NLTK POS tag to the matching wordnet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively;
    any other tag yields ``None``.
    """
    prefix_to_pos_name = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, pos_name in prefix_to_pos_name:
        if nltk_tag.startswith(prefix):
            # Looked up only on a match, so unmatched tags never touch `wordnet`.
            return getattr(wordnet, pos_name)
    return None

In [15]:
def tokenizer_lemmatizer(sentence):
    """Tokenize ``sentence``, POS-tag the tokens and return them lemmatized.

    Tokens whose tag has no wordnet equivalent (``nltk_tag_to_wordnet_tag``
    returns ``None``) are kept unchanged. Returns a list of strings.
    """
    wnl = WordNetLemmatizer()
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    lemmas = []
    for token, treebank_tag in tagged_tokens:
        pos = nltk_tag_to_wordnet_tag(treebank_tag)
        # Lemmatize only when a wordnet POS is available for the token.
        lemmas.append(token if pos is None else wnl.lemmatize(token, pos))
    return lemmas

In [6]:
def _feature_names(vectorizer):
    """Return the fitted vocabulary terms of ``vectorizer`` in column order.

    scikit-learn 1.0 added ``get_feature_names_out`` and 1.2 removed
    ``get_feature_names``; call whichever one the installed version provides
    so this cell runs on both old and current releases.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(text)
tfidf_feature_names = _feature_names(tfidf_vectorizer)

# LDA is fitted on raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = tf_vectorizer.fit_transform(text)
tf_feature_names = _feature_names(tf_vectorizer)

In [7]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 3

# Run NMF
#nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
# `n_components` is the supported keyword for the number of topics; the old
# `n_topics` alias was deprecated in scikit-learn 0.19 and removed in 0.21.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tf)




In [8]:
# Transform the count matrix with the fitted LDA model and print the column-wise mean.
doc_topic_weights = lda.transform(tf)
df = pd.DataFrame(doc_topic_weights)
print(df.mean(axis=0))

0    0.404514
1    0.213036
2    0.382450
dtype: float64


In [9]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in ``model.components_``.

    For each row of ``model.components_`` a ``Topic <index>:`` header is
    printed, followed by one line with the ``no_top_words`` entries of
    ``feature_names`` that have the largest weights, in descending order.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % topic_idx)
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked_ids = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[term_id] for term_id in ranked_ids]
        print(" ".join(top_terms))

# Number of top terms to list for each topic.
no_top_words = 10
#display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(model=lda, feature_names=tf_feature_names, no_top_words=no_top_words)

Topic 0:
got tourney cool home nt today game cities hey fun
Topic 1:
school summer game did watched love got nt home hung
Topic 2:
just games cities summer weekend tourney busy hey later love


In [20]:
import warnings
# Suppress DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)
import nltk

# NLTK data needed by tokenizer_lemmatizer:
#   punkt                      -> nltk.word_tokenize
#   averaged_perceptron_tagger -> nltk.pos_tag
#   wordnet                    -> WordNetLemmatizer
# 'punkt' was previously missing, so a fresh environment failed with a
# LookupError at the first tokenization call.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
  

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Sailalitha\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sailalitha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [21]:
# Run every loaded document through the tokenizer/lemmatizer, keeping document order.
cleaned_text = list(map(tokenizer_lemmatizer, text))
print(cleaned_text)

[['hey', 'my', 'weekend', 'be', 'pretty', 'good', 'my', 'mom', 'and', 'i', 'drive', 'down', 'on', 'friday', 'night', 'and', 'it', 'be', 'stormy', 'and', 'we', 'get', 'there', 'and', 'i', 'find', 'a', 'lot', 'of', 'girl', 'on', 'my', 'soccer', 'team', 'we', 'all', 'get', 'in', 'our', 'bikini', 'and', 'hung', 'out', 'at', 'the', 'pool', 'and', 'meet', 'some', 'guy', 'that', 'be', 'at', 'a', 'state', 'track', 'meet', 'they', 'be', 'really', 'hot', 'with', 'ripped', 'stomach', 'muscle', 'and', 'prolly', '16', 'or', '17', 'we', 'saw', 'them', 'again', 'later', 'and', 'everyone', 'be', 'play', 'it', 'all', 'cool', 'as', 'we', 'flirt', 'with', 'them', 'and', 'i', 'have', 'to', 'be', 'dumb', 'and', 'blurt', 'out', 'why', 'be', 'your', 'shirt', 'on', 'i', 'm', 'just', 'too', 'cool', 'on', 'saturday', 'we', 'lose', 'two', 'game', 'which', 'suck', 'cuz', 'the', 'team', 'we', 'palyed', 'be', 'nt', 'good', 'but', 'we', 'be', 'miss', 'people', 'and', 'die', 'from', 'the', 'heat', 'lol', 'anyways', '

In [22]:
# Build the gensim token <-> id dictionary from the lemmatized documents
import gensim.corpora as corpora
id2word = corpora.Dictionary(cleaned_text)

# Bag-of-words (token id, count) representation of each document
corpus = [id2word.doc2bow(doc_tokens) for doc_tokens in cleaned_text]

In [23]:
# Human readable (token, frequency) view of the first document in the corpus
first_doc_bows = corpus[:1]
[[(id2word[token_id], count) for token_id, count in bow] for bow in first_doc_bows]

[[('16', 1),
  ('17', 1),
  ('5', 1),
  ('50', 1),
  ('a', 10),
  ('abercrombie', 1),
  ('about', 1),
  ('again', 1),
  ('age', 1),
  ('all', 3),
  ('amanda', 1),
  ('american', 1),
  ('an', 1),
  ('and', 33),
  ('angel', 1),
  ('anyways', 1),
  ('as', 1),
  ('assist', 1),
  ('at', 2),
  ('bath', 1),
  ('bball', 1),
  ('be', 14),
  ('belt', 1),
  ('big', 1),
  ('bikini', 1),
  ('blurt', 1),
  ('body', 1),
  ('book', 1),
  ('breanna', 1),
  ('but', 1),
  ('buttonup', 1),
  ('capri', 1),
  ('character', 1),
  ('check', 1),
  ('christine', 1),
  ('cool', 2),
  ('cuz', 1),
  ('date', 1),
  ('die', 1),
  ('do', 1),
  ('down', 1),
  ('drive', 2),
  ('dumb', 1),
  ('eagle', 1),
  ('emily', 1),
  ('everyone', 1),
  ('fall', 1),
  ('find', 1),
  ('first', 2),
  ('flirt', 1),
  ('for', 1),
  ('forward', 1),
  ('friday', 1),
  ('from', 5),
  ('fun', 1),
  ('funny', 1),
  ('g', 1),
  ('game', 3),
  ('gaudy', 1),
  ('get', 8),
  ('girl', 1),
  ('go', 2),
  ('goal', 2),
  ('good', 2),
  ('guy', 2),


In [24]:
from gensim.models.ldamodel import LdaModel

# Settings for the gensim LDA model, collected in one place
lda_params = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# Build LDA model
lda_model = LdaModel(**lda_params)

In [25]:
# Print the keywords of the fitted topics, then apply the model to the corpus
topic_keywords = lda_model.print_topics()
print(topic_keywords)
doc_lda = lda_model[corpus]

[(0, '0.006*"and" + 0.005*"i" + 0.005*"a" + 0.005*"we" + 0.004*"the" + 0.004*"then" + 0.004*"get" + 0.004*"be" + 0.004*"my" + 0.004*"of"'), (1, '0.004*"and" + 0.004*"i" + 0.004*"s" + 0.004*"m" + 0.004*"a" + 0.004*"my" + 0.004*"that" + 0.004*"be" + 0.004*"it" + 0.004*"summer"'), (2, '0.004*"and" + 0.004*"be" + 0.004*"i" + 0.004*"the" + 0.004*"then" + 0.004*"we" + 0.004*"a" + 0.004*"get" + 0.004*"my" + 0.004*"mom"'), (3, '0.039*"the" + 0.026*"i" + 0.026*"a" + 0.026*"be" + 0.026*"have" + 0.026*"game" + 0.026*"in" + 0.026*"city" + 0.026*"ve" + 0.014*"and"'), (4, '0.004*"and" + 0.004*"i" + 0.004*"the" + 0.004*"a" + 0.004*"get" + 0.004*"then" + 0.004*"my" + 0.004*"we" + 0.004*"with" + 0.004*"s"'), (5, '0.004*"and" + 0.004*"i" + 0.004*"a" + 0.004*"be" + 0.004*"we" + 0.004*"the" + 0.004*"get" + 0.004*"then" + 0.004*"my" + 0.004*"of"'), (6, '0.075*"and" + 0.042*"i" + 0.030*"a" + 0.030*"be" + 0.027*"the" + 0.026*"my" + 0.025*"we" + 0.022*"get" + 0.017*"of" + 0.017*"then"'), (7, '0.005*"and" + 0.