In [1]:
# Text as data: a string is an ordered sequence of characters.

sentence = "a string is a sequence of characters"

# Several strings can be kept together in a list; lists can grow in place.
words = ["these", "strings", "are", "stored", "as", "a", "list"]
words.append("!")

# print is a function, hence the parentheses ().
# Show the raw string, the list, and the string split on whitespace.
for item in (sentence, words, sentence.split()):
    print(item)

a string is a sequence of characters
['these', 'strings', 'are', 'stored', 'as', 'a', 'list', '!']
['a', 'string', 'is', 'a', 'sequence', 'of', 'characters']


In [2]:
# type() reports the class of an object; a quoted literal such as `sentence` is a str.
type(sentence)

str

In [3]:
# A square-bracket literal creates a list, even when every item in it is a string.
type(words)

list

In [4]:
type(words[0]) # index 0 is the first item in the list; the item itself is a str

str

In [5]:
# len() counts characters for a string and items for a list.
n_characters = len(sentence)
n_tokens = len(sentence.split())

print(n_characters)  # number of characters in the string
print(n_tokens)  # number of items in the list returned by split()

36
7


In [6]:
# Example text used by the string-method and pre-processing cells that follow.

tweet = """Why does the Mueller team have 13 hardened Democrats, 
 some big Crooked Hillary supporters, and Zero Republicans? 
 Another Dem recently added. does anyone think this is fair? 
 And yet, there is NO COLLUSION!"""

# Triple quotes let a string literal span several lines; the line breaks and
# the leading space on each continuation line are part of the string.

print(tweet)

Why does the Mueller team have 13 hardened Democrats, 
 some big Crooked Hillary supporters, and Zero Republicans? 
 Another Dem recently added. does anyone think this is fair? 
 And yet, there is NO COLLUSION!


In [None]:
# A triple-quoted, multi-line literal is still an ordinary str.
type(tweet)

In [None]:
# String methods return a new string; `tweet` itself is left unchanged.
# .lower() converts every cased character to lowercase.

tweet.lower()

In [None]:
# .upper() is the counterpart of .lower(): it returns an all-uppercase copy.

tweet.upper()

In [None]:
# .replace(old, new) returns a copy with every occurrence of `old` swapped
# for `new`; the match is case-sensitive.

tweet.replace('Hillary','Clinton')

In [None]:
# Methods can be chained: each call runs on the result of the previous one,
# left to right (replace -> lower -> split).

tweet.replace('Hillary','Clinton').lower().split() # note the order: replace runs while 'Hillary' is still capitalized

In [None]:
# Lowercasing first turns 'Hillary' into 'hillary', so the case-sensitive
# replace('Hillary', ...) no longer finds anything to replace.
tweet.lower().replace('Hillary','Clinton').split() # hillary doesn't get replaced in this case

In [None]:
# Filter vocabularies used by the tweet pre-processing cells.

# Common function words to drop from the text.
stop_words = [
    "the", "it", "is", "a", "was", "and",
    "why", "what", "how", "has", "have", "this", "that",
]

# Characters to erase from the text before it is split into words.
punctuation = [".", ",", "?", "!", "#", "$", "\n"]

# Small hand-made sentiment word lists.
positive_words = ["fair", "good", "nice", "super", "fun", "delightful", "supporters"]
negative_words = ["no", "crooked", "collusion", "bad", "sad"]

In [None]:
# Simple pre-processing of the tweet: lowercase it, erase punctuation,
# split it into words, then drop the stop words.

tweet_processed = tweet.lower()  # convert to lowercase

for mark in punctuation:
    tweet_processed = tweet_processed.replace(mark, '')  # erase punctuation

words = tweet_processed.split()  # store the processed tweet in a list called 'words'

# Copy every word that is not a stop word into a new list, keeping the order.
results = []
for token in words:
    if token in stop_words:
        continue
    results.append(token)

print(results)

In [None]:
# A list comprehension builds the same filtered list as the loop above in one expression.

list_comp = [
    token
    for token in words
    if not (token in punctuation or token in stop_words)
]

# True when the comprehension and the explicit loop produced identical lists.
list_comp == results

In [None]:
# Show the filtered words produced by the list comprehension.
print(list_comp)

In [None]:
# Which words are positive or negative?
# All positive matches are reported first, then all negative matches.

positive_hits = [token for token in words if token in positive_words]
for token in positive_hits:
    print(token + ' is a positive word')

negative_hits = [token for token in words if token in negative_words]
for token in negative_hits:
    print(token + ' is a negative word')

In [None]:
# NLTK: The Natural Language Tool Kit
# NOTE(review): presumably the 'gutenberg' corpus data must already be installed
# locally (e.g. via nltk.download) for the call below to succeed — confirm.

import nltk
nltk.corpus.gutenberg.fileids() # these are the book corpora included in nltk

In [None]:
# Three views of the same play, at different levels of granularity.
gutenberg = nltk.corpus.gutenberg
HAMLET_ID = 'shakespeare-hamlet.txt'

hamlet_words = gutenberg.words(HAMLET_ID)  # list of words
hamlet_sents = gutenberg.sents(HAMLET_ID)  # list of sentences; each sentence is a list
hamlet_paras = gutenberg.paras(HAMLET_ID)  # list of paragraphs; each paragraph is a list

In [None]:
# Peek at the first five items of each view instead of printing the whole play.
print(hamlet_words[0:5])
print(hamlet_sents[0:5])
print(hamlet_paras[0:5])

In [None]:
# NLTK's stop-word lists; words('english') selects the English one.
from nltk.corpus import stopwords
stopwords.words('english')[:20] # the [:20] slice keeps only the first 20 items in the list

In [None]:
# Build the stop-word collection once. Calling stopwords.words('english')
# inside the comprehension would rebuild the whole list for every word in the
# play, and membership tests against a set avoid scanning a list each time.
english_stopwords = set(stopwords.words('english'))

# Keep the words whose lowercase form is not a stop word; show the first 10 items in the list
[word for word in hamlet_words if word.lower() not in english_stopwords][:10]

In [None]:
# Keep only tokens made entirely of letters (isalpha), lowercased; show the first 10.
[word.lower() for word in hamlet_words if word.isalpha()][:10] 

In [None]:
from nltk.stem.porter import PorterStemmer

# One stemmer instance is reused for every token.
porter = PorterStemmer()

# Compare a 30-token excerpt of the play with the stemmed form of each token.
sample = hamlet_words[80:110]

print(list(sample))
print([porter.stem(token) for token in sample])

In [None]:
# word_tokenize turns a raw string into a list of tokens.
# NOTE(review): the name `text` is rebound to the sklearn.feature_extraction.text
# module by a later cell's `import ... as text`, so this list does not survive it.
text = nltk.word_tokenize("I'm gonna tokenize this sentence into a list of words")
text

In [None]:
import os  # needed here: no earlier cell imports it, so a fresh kernel raised NameError

# Location of the State of the Union corpus. Built from the current user's
# home directory (nltk_data/corpora/state_union beneath it) instead of one
# specific person's absolute path, so the notebook is not tied to one machine.
# NOTE(review): assumes the corpus was downloaded to ~/nltk_data — adjust if
# your NLTK data lives elsewhere.
CORPUS_PATH = os.path.join(os.path.expanduser("~"), "nltk_data", "corpora", "state_union")

# Full path of every entry in the corpus directory, in sorted order.
filenames = sorted(os.path.join(CORPUS_PATH, fn) for fn in os.listdir(CORPUS_PATH))

filenames

In [None]:
# Keep only the speech transcripts (*.txt). This drops the non-text entry at
# the end of the sorted listing (presumably the corpus README — confirm).
# Filtering is safe to re-run: a second run removes nothing, whereas
# filenames.pop(-1) would discard a real speech each time the cell is run again.
dropped = [fn for fn in filenames if not fn.endswith('.txt')]
filenames = [fn for fn in filenames if fn.endswith('.txt')]

dropped  # show what was removed

In [None]:
# Display the list again to confirm what remains after the previous cell's removal.
filenames

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.feature_extraction.text as text

import numpy as np

# CountVectorizer gathers word (term) frequencies for a set of texts into a
# document-term matrix.
#   input='filename': the items given to fit_transform are paths of files to read
#   min_df: an integer discards words appearing in fewer than that many documents
#           (in a large corpus, 15 or higher helps eliminate very rare words)
#   max_df: a float discards words appearing in more than that share of documents
#   decode_error: bytes that are not valid in the expected encoding raise a
#           UnicodeDecodeError by default; 'ignore' or 'replace' prevents that
#   stop_words='english': also drop scikit-learn's built-in English stop words
vectorizer = CountVectorizer(input='filename', min_df=10, max_df=.95, decode_error='replace', stop_words='english')

# Document-term matrix: one row per path in `filenames`, one column per
# vocabulary word. toarray() converts the sparse result to a dense NumPy array.
dtm = vectorizer.fit_transform(filenames).toarray()

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement,
# get_feature_names_out(), and fall back only on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out()
else:
    vocab = vectorizer.get_feature_names()

# Keep the vocabulary as a NumPy string array, because an array supports a
# greater variety of operations than a list (e.g. indexing with an index array).
vocab = np.array(vocab, dtype=str)

# Now we have a document-term matrix (dtm) and a vocabulary array (vocab).
dtm.shape

In [None]:
##################
# TOPIC MODELING #
##################

# A "non-negative matrix" is a matrix containing non-negative values (i.e.
# zero or positive word frequencies). Non-negative matrix factorization (NMF)
# is often characterized as a machine learning algorithm. It strongly
# resembles Latent Dirichlet Allocation (LDA), which is a probabilistic model
# of the corpus. Whereas LDA expresses uncertainty about the placement of
# topics across texts and the assignment of words to topics, NMF is a
# deterministic algorithm that arrives at a single representation of the
# corpus. Both take a corpus and uncover "latent topics".
#
# In what follows, let's start with an NMF topic model.

from sklearn import decomposition

# NMF gives us a document-topic matrix (topics are also referred to as
# "components") and, from the components, a list of top words for each topic.

num_topics = 10     # number of latent topics
num_top_words = 20  # number of words per topic

# Plug these settings into the algorithm; random_state is fixed at 1.
clf = decomposition.NMF(n_components=num_topics, random_state=1)

# Plug the dtm into the algorithm.
doctopic = clf.fit_transform(dtm)

# For every component, rank the vocabulary by weight (largest first) and keep
# the leading num_top_words words.
topic_words = []
for component in clf.components_:
    ranked = np.argsort(component)[::-1]
    top_idx = ranked[:num_top_words]
    topic_words.append([vocab[j] for j in top_idx])

# Scale the document-component matrix so that the component values associated
# with each document sum to one.
row_totals = np.sum(doctopic, axis=1, keepdims=True)
doctopic = doctopic / row_totals

In [None]:
import os  # used below; no earlier cell imports it

# Derive a president label from each file name.
# NOTE(review): assumes base names look like '<year>-<Name>.txt', optionally
# with a '-1'/'-2'/'-3' suffix before the extension — confirm against the corpus.
presidents = []

for fn in filenames:
    basename = os.path.basename(fn)
    name, ext = os.path.splitext(basename)
    name = name.lstrip('0123456789-')  # drop the leading digits and dash
    name = name.rstrip('-123')         # drop a trailing suffix (rstrip removes any of the characters '-', '1', '2', '3')
    presidents.append(name)

# turn this into an array so we can use NumPy functions
presidents = np.asarray(presidents)

# Snapshot the per-document matrix only while `doctopic` still has one row per
# file. This cell later rebinds `doctopic` to the grouped matrix, so on a
# re-run the existing snapshot is reused instead of failing on a row mismatch.
if doctopic.shape[0] == len(presidents):
    doctopic_orig = doctopic.copy()

# One group per distinct president, in sorted order.
group = sorted(set(presidents))
num_groups = len(group)

# Average the topic shares of all documents belonging to the same president.
doctopic_grouped = np.zeros((num_groups, num_topics))

for i, name in enumerate(group):
    doctopic_grouped[i, :] = np.mean(doctopic_orig[presidents == name, :], axis=0)

# Later cells read the grouped matrix under the name `doctopic`.
doctopic = doctopic_grouped

print("Top NMF topics in...")

# Python uses 0-based indexing, so the first topic is topic 0.
for label, shares in zip(group, doctopic):
    top_topics = np.argsort(shares)[::-1][0:3]
    top_topics_str = ' '.join(str(t) for t in top_topics)
    print("{}: {}".format(label, top_topics_str))  # the numbers are the top 3 topics for that president

for t, top_words in enumerate(topic_words):
    # print the top num_top_words words associated with each topic
    print("Topic {}: {}".format(t, ' '.join(top_words[:num_top_words])))

In [None]:
import numpy as np

import matplotlib.pyplot as plt

N, K = doctopic.shape  # N rows (one per label in `group`), K topics

ind = np.arange(N)  # the x-axis locations for the bars

width = 0.2  # the width of the bars

plots = []

# Running total of the bar heights drawn so far; the next topic is stacked on top of it.
height_cumulative = np.zeros(N)

for k in range(K):
    color = plt.cm.coolwarm(k/K, 1)  # colormap: one color per topic
    # bottom is all zeros for the first topic, so no special case is needed for k == 0
    p = plt.bar(ind, doctopic[:, k], width, bottom=height_cumulative, color=color)
    height_cumulative += doctopic[:, k]
    plots.append(p)

plt.ylim((0, 1))  # the stacked bars are plotted on a 0-to-1 scale

plt.ylabel('Topic share')  # y-axis label

plt.title('Topics')  # plot title

# Bars are centered on `ind` by default, so the ticks go at `ind` itself;
# offsetting them by width/2 would place the labels off-center.
plt.xticks(ind, group, rotation=45)  # placement and naming of x-axis ticks

topic_labels = ['Topic #{}'.format(k) for k in range(K)]  # {} becomes the topic index 0..K-1


# see http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.legend for more details on making a legend in matplotlib

plt.legend([p[0] for p in plots], topic_labels, bbox_to_anchor=(1.4,0.8), title="Topics")

plt.subplots_adjust(bottom=0.3)  # leave room below the axes for the tick labels

plt.show()

In [None]:
import matplotlib
import matplotlib.pyplot as plt

topic_id = 2  # which topic to plot (0-based column of doctopic)

N, K = doctopic.shape  # N rows (one per label in `group`), K topics

ind = np.arange(N)  # points on the x-axis

width = 0.5  # the width of the bars

plt.bar(ind, doctopic[:, topic_id], width=width)

# Bars are centered on `ind` by default, so placing the ticks at `ind` puts
# each label under the middle of its bar.
plt.xticks(ind, group, rotation=45)

plt.ylabel('Topic share')

plt.title('Share of Topic #{}'.format(topic_id))

plt.show()