In [1]:
# Standard libraries
import pandas as pd

# Text cleaning / preprocessing
import spacy as spy
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# LDA
from gensim import corpora
from gensim import models
from gensim import similarities
from gensim import matutils

In [2]:
# Things to try
# - different amounts of topics
# - different corpuses (quotes)

### Get data

In [67]:
#path = 'text/full_text.txt'
path = 'text/quotes.txt'

In [68]:
# f = open(path, encoding='utf-8')
# raw_text = ''

# for line in f:
#     line = line.strip()
#     raw_text += line

In [72]:
# Open file
f = open(path, encoding='utf-8')

# Instantiate documents list
documents = []

# Iterate through each line (document), strip newlines, and append to raw_text list
for line in f:
    documents.append(line.rstrip())

In [73]:
# Get rid of extra newlines that resulted in empty strings
documents = [doc for doc in documents if doc != '']

### Clean + preprocess text

In [74]:
def clean_text(document_string):
    """
    Function that takes in a document in
    the form of a string, and preprocesses
    it, returning a clean string ready
    to be used to fit a CountVectorizer.
    
    Preprocessing includes:
    - lowercasing text
    - eliminating punctuation
    - dealing with edge case punctuation
      and formatting
    - replacing contractions with
      the proper full words
      
    :param: document_string: str
    
    :returns: cleaned_text: str
    """
    # Make text lowercase
    raw_text = document_string.lower()

    # Replace encoding error with a space
    raw_text = raw_text.replace('\xa0', ' ')

    # Replace period with a space
    raw_text = raw_text.replace('.', '')

    # Replace exclamation point with a space
    raw_text = raw_text.replace('!', ' ')

    # Replace slashes with empty
    raw_text = raw_text.replace('/', '')

    # Replace questin marks with empty
    raw_text = raw_text.replace('??', '')
    raw_text = raw_text.replace('?', '')

    # Replace dashes with space
    raw_text = raw_text.replace('-', ' ')
    raw_text = raw_text.replace('—', ' ')

    # Replace ... with empty
    raw_text = raw_text.replace('…', '')
    raw_text = raw_text.replace('...', '')

    # Replace commas with empty
    raw_text = raw_text.replace(',', '')

    # Replace semi-colon with empty
    raw_text = raw_text.replace(';', '')

    # Get rid of brackets
    raw_text = raw_text.replace('[', '')
    raw_text = raw_text.replace(']', '')

    # Replace quotes with nothing
    raw_text = raw_text.replace('“', '')
    raw_text = raw_text.replace('”', '')
    raw_text = raw_text.replace('"', '')
    raw_text = raw_text.replace("‘", "")

    # Get rid of backslashes indicating contractions
    raw_text = raw_text.replace(r'\\', '')

    # Replace extra spaces with single space
    raw_text = raw_text.replace('   ', ' ')
    raw_text = raw_text.replace('  ', ' ')

    # Some apostrophes are of a different type --> ’ instead of '
    raw_text = raw_text.replace("’", "'")

    # Replace contractions with full words, organized alphabetically
    raw_text = raw_text.replace("can't", 'cannot')
    raw_text = raw_text.replace("didn't", 'did not')
    raw_text = raw_text.replace(r"doesn\'t", 'does not')
    raw_text = raw_text.replace("don't", 'do not')
    raw_text = raw_text.replace("he's", 'he is')
    raw_text = raw_text.replace("i'd", 'i would')
    raw_text = raw_text.replace("i'll", 'i will')
    raw_text = raw_text.replace(r"i\'m", 'i am')
    raw_text = raw_text.replace(r"isn\'t", 'is not')
    raw_text = raw_text.replace("it's", 'it is')
    raw_text = raw_text.replace("nobody's", 'nobody is')
    raw_text = raw_text.replace("she's", 'she is')
    raw_text = raw_text.replace("shouldn't", 'should not')
    raw_text = raw_text.replace("that's", 'that is')
    raw_text = raw_text.replace("they're", 'they are')
    raw_text = raw_text.replace("there's", 'there are')
    raw_text = raw_text.replace("we're", 'we are')
    raw_text = raw_text.replace("we've", 'we have')
    raw_text = raw_text.replace("you're", 'you are')
    raw_text = raw_text.replace("you've", 'you have')

    # Fix other contractions
    raw_text = raw_text.replace("'s", ' is')
    
    cleaned_text = raw_text
    
    return(cleaned_text)

In [75]:
cleaned_documents = [clean_text(doc) for doc in documents]

In [76]:
cleaned_documents

['we are all just walking each other home',
 'the quieter you become the more you can hear',
 'we are fascinated by the words but where we meet is in the silence behind them',
 'it is important to expect nothing to take every experience including the negative ones as merely steps on the path and to proceed',
 "the most exquisite paradox as soon as you give it all up you can have it all as long as you want power you cannot have it the minute you do not want power you'll have more than you ever dreamed possible",
 "i would like my life to be a statement of love and compassion and where it isn't that is where my work lies",
 'the heart surrenders everything to the moment the mind judges and holds back',
 'be here now',
 'in most of our human relationships we spend much of our time reassuring one another that our costumes of identity are on straight',
 'your problem is you are too busy holding on to your unworthiness',
 'as long as you have certain desires about how it ought to be you cann

In [7]:
# Input to fit count_vectorizer expects list of string(s)
#clean_text = [raw_text]

# # Get rid of stopwords 
# stoplist =  set(stopwords.words('english'))
# clean_text = [word for word in raw_text.split() if word not in stoplist]

# print(clean_text)

In [77]:
# Create CountVectorizer for parsing / counting words
count_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                  stop_words='english', token_pattern='\\b[a-z][a-z]+\\b')

count_vectorizer.fit(cleaned_documents)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [79]:
# Create the term-document matrix
# Transpose it so the terms are the rows
counts = count_vectorizer.transform(cleaned_documents).transpose()

In [80]:
counts.shape

(1831, 122)

In [81]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(counts)

In [82]:
id2word = dict((word, rowID) for rowID, word in count_vectorizer.vocabulary_.items())

In [83]:
len(id2word)

1831

### LDA

In [84]:
# Create LDA model
lda = models.LdaModel(corpus=corpus, num_topics=5, minimum_probability=0.03, id2word=id2word, passes=10)

  expElogthetad = np.exp(Elogthetad)


In [85]:
lda.print_topics()

[(0,
  'nan*"planet" + nan*"planet love" + nan*"plane" + nan*"plane relieve" + nan*"planes" + nan*"planes consciousness" + nan*"places" + nan*"point chest" + nan*"play roles" + nan*"placed mountain"'),
 (1,
  'nan*"planet" + nan*"planet love" + nan*"plane" + nan*"plane relieve" + nan*"planes" + nan*"planes consciousness" + nan*"places" + nan*"point chest" + nan*"play roles" + nan*"placed mountain"'),
 (2,
  'nan*"planet" + nan*"planet love" + nan*"plane" + nan*"plane relieve" + nan*"planes" + nan*"planes consciousness" + nan*"places" + nan*"point chest" + nan*"play roles" + nan*"placed mountain"'),
 (3,
  'nan*"planet" + nan*"planet love" + nan*"plane" + nan*"plane relieve" + nan*"planes" + nan*"planes consciousness" + nan*"places" + nan*"point chest" + nan*"play roles" + nan*"placed mountain"'),
 (4,
  'nan*"planet" + nan*"planet love" + nan*"plane" + nan*"plane relieve" + nan*"planes" + nan*"planes consciousness" + nan*"places" + nan*"point chest" + nan*"play roles" + nan*"placed mou